Unverified · Commit 29e4dc5e · authored by overlordmax, committed by GitHub

Pr 05192151 (#4651)

* fix bugs

* fix bugs

* add wide_deep

* fix code style

* fix code style

* fix some bugs

* fix filename

* add ncf

* add download data

* add download data

* add youtube dnn

* edit README.md

* fix some bugs

* add listwise

* fix code style

* fix some bug
Parent 1bf72647
@@ -108,12 +108,3 @@ for i in range(sample_size):
    feed_var_names = ["query", "doc_pos"]
    fetch_vars = [R_Q_D_p]
    fluid.io.save_inference_model(args.model_dir, feed_var_names, fetch_vars, exe)
@@ -14,7 +14,6 @@ def infer(args):
    with fluid.scope_guard(fluid.Scope()):
        infer_program, feed_target_names, fetch_vars = fluid.io.load_inference_model(args.model_dir, exe)
-       # construct test data
        sample_size = 100
        l_Qs = []
        pos_l_Ds = []
......
 python infer.py --use_gpu 0 \
-    --model_dir 'model_dir'
+    --model_dir ./model_dir
\ No newline at end of file
 CUDA_VISIBLE_DEVICES=0 python infer.py --use_gpu 1 \
-    --model_dir 'model_dir'
+    --model_dir ./model_dir
\ No newline at end of file
@@ -6,4 +6,4 @@ python dssm.py --use_gpu 0 \
    --L3_N 128 \
    --Neg 4 \
    --base_lr 0.01 \
-    --model_dir 'model_dir'
+    --model_dir ./model_dir
\ No newline at end of file
@@ -6,4 +6,4 @@ CUDA_VISIBLE_DEVICES=0 python dssm.py --use_gpu 1 \
    --L3_N 128 \
    --Neg 4 \
    --base_lr 0.01 \
-    --model_dir 'model_dir'
+    --model_dir ./model_dir
\ No newline at end of file
@@ -7,7 +7,6 @@
├── net.py          # ESMM network structure
├── train.py        # ESMM model training script
├── infer.py        # ESMM model inference script
-├── reader.py       # data preprocessing file
├── utils.py        # common utility functions
├── args.py         # argument parsing script
├── get_data.sh     # script to generate the training data
@@ -16,6 +15,7 @@
├── cpu_train.sh    # CPU training shell script
├── gpu_infer.sh    # GPU inference shell script
├── cpu_infer.sh    # CPU inference shell script
+├── vocab_size.txt  # vocabulary size file
```
## Introduction
@@ -50,14 +50,14 @@ GPU environment
Set the data paths and parameters in the gpu_train.sh script.
```shell
-CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu True\  # whether to use gpu
+CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 1\  # whether to use gpu
    --epochs 100\  # number of training epochs
    --batch_size 64\  # batch size
    --embed_size 12\  # embedding dimension of each featsign
    --cpu_num 2\  # number of CPU cores
    --model_dir ./model_dir \  # model save path
    --train_data_path ./train_data \  # training data path
-    --vocab_path ./vocab/vocab_size.txt  # path to the embedding vocabulary size file
+    --vocab_path ./vocab_size.txt  # path to the embedding vocabulary size file
```
Make the script executable and run it:
@@ -71,14 +71,14 @@ CPU environment
Set the data paths and parameters in the cpu_train.sh script.
```shell
-python train.py --use_gpu False\  # whether to use gpu
+python train.py --use_gpu 0\  # whether to use gpu
    --epochs 100\  # number of training epochs
    --batch_size 64\  # batch size
    --embed_size 12\  # embedding dimension of each featsign
    --cpu_num 2\  # number of CPU cores
    --model_dir ./model_dir \  # model save path
    --train_data_path ./train_data \  # training data path
-    --vocab_path ./vocab/vocab_size.txt  # path to the embedding vocabulary size file
+    --vocab_path ./vocab_size.txt  # path to the embedding vocabulary size file
```
Make the script executable and run it:
@@ -94,10 +94,10 @@ GPU environment
Set the data paths and parameters in the gpu_infer.sh script.
```sh
-python infer.py --use_gpu True\  # whether to use gpu
+python infer.py --use_gpu 1\  # whether to use gpu
    --batch_size 64\  # batch size
    --test_data_path ./test_data \  # test data path
-    --vocab_path ./vocab/vocab_size.txt  # path to the embedding vocabulary size file
+    --vocab_path ./vocab_size.txt  # path to the embedding vocabulary size file
```
Make the script executable and run it:
@@ -111,11 +111,11 @@ CPU environment
Set the data paths and parameters in the cpu_infer.sh script.
```shell
-python infer.py --use_gpu False\  # whether to use gpu
+python infer.py --use_gpu 0\  # whether to use gpu
    --batch_size 64\  # batch size
    --cpu_num 2\  # number of CPU cores
    --test_data_path ./test_data \  # test data path
-    --vocab_path ./vocab/vocab_size.txt  # path to the embedding vocabulary size file
+    --vocab_path ./vocab_size.txt  # path to the embedding vocabulary size file
```
Make the script executable and run it:
......
@@ -27,12 +27,12 @@ def parse_args():
    parser.add_argument("--batch_size", type=int, default=64, help="batch_size")
    parser.add_argument("--embed_size", type=int, default=12, help="embed_size")
    parser.add_argument("--cpu_num", type=int, default=2, help="cpu_num")
-    parser.add_argument('--use_gpu', type=bool, default=False, help='whether using gpu')
+    parser.add_argument('--use_gpu', type=int, default=0, help='whether using gpu')
    parser.add_argument('--model_dir', type=str, default='./model_dir', help='model_dir')
    parser.add_argument('--train_data_path', type=str, default='./train_data', help='train_data_path')
    parser.add_argument('--test_data_path', type=str, default='./test_data', help='test_data_path')
-    parser.add_argument('--vocab_path', type=str, default='./vocab/vocab_size.txt', help='vocab_path')
+    parser.add_argument('--vocab_path', type=str, default='./vocab_size.txt', help='vocab_path')
    parser.add_argument("--train_sample_size", type=int, default=sys.maxsize, help="train_sample_size")
    parser.add_argument("--test_sample_size", type=int, default=sys.maxsize, help="test_sample_size")
......
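A note on the `type=bool` → `type=int` change above: argparse applies `type` to the raw command-line string, and `bool('False')` is `True` (any non-empty string is truthy), so a `type=bool` flag can never actually be switched off. A minimal standalone sketch of the pitfall and the fix (illustration only, not code from this PR):

```python
import argparse

parser = argparse.ArgumentParser()
# The old pattern: argparse calls bool("False"), and any non-empty
# string is truthy in Python, so this flag is effectively always True.
parser.add_argument('--use_gpu_bool', type=bool, default=False)
# The corrected pattern from this PR: parse an integer and test it.
parser.add_argument('--use_gpu', type=int, default=0)

args = parser.parse_args(['--use_gpu_bool', 'False', '--use_gpu', '0'])
assert args.use_gpu_bool is True   # surprising, but how argparse works
assert not args.use_gpu            # 0 is falsy, as intended
```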
-python infer.py --use_gpu False\  # whether to use gpu
-    --batch_size 64\  # batch size
-    --cpu_num 2\  # number of CPU cores
-    --test_data_path ./test_data \  # test data path
-    --vocab_path ./vocab/vocab_size.txt  # path to the embedding vocabulary size file
+python infer.py --use_gpu 0 \
+    --batch_size 64 \
+    --cpu_num 2 \
+    --test_data_path ./test_data \
+    --vocab_path ./vocab_size.txt
\ No newline at end of file
-python train.py --use_gpu False\  # whether to use gpu
-    --epochs 100\  # number of training epochs
-    --batch_size 64\  # batch size
-    --embed_size 12\  # embedding dimension of each featsign
-    --cpu_num 2\  # number of CPU cores
-    --model_dir ./model_dir \  # model save path
-    --train_data_path ./train_data \  # training data path
-    --vocab_path ./vocab/vocab_size.txt  # path to the embedding vocabulary size file
+python train.py --use_gpu 0 \
+    --epochs 100 \
+    --batch_size 64 \
+    --embed_size 12 \
+    --cpu_num 2 \
+    --model_dir ./model_dir \
+    --train_data_path ./train_data \
+    --vocab_path ./vocab_size.txt
\ No newline at end of file
 mkdir train_data
 mkdir test_data
-mkdir vocab
-mkdir data
-train_source_path="./data/sample_train.tar.gz"
-train_target_path="train_data"
-test_source_path="./data/sample_test.tar.gz"
-test_target_path="test_data"
-cd data
-echo "downloading sample_train.tar.gz......"
-curl -# 'http://jupter-oss.oss-cn-hangzhou.aliyuncs.com/file/opensearch/documents/408/sample_train.tar.gz?Expires=1586435769&OSSAccessKeyId=LTAIGx40tjZWxj6q&Signature=ahUDqhvKT1cGjC4%2FIER2EWtq7o4%3D&response-content-disposition=attachment%3B%20' -H 'Proxy-Connection: keep-alive' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' -H 'Accept-Language: zh-CN,zh;q=0.9' --compressed --insecure -o sample_train.tar.gz
-cd ..
-echo "unzipping sample_train.tar.gz......"
-tar -xzvf ${train_source_path} -C ${train_target_path} && rm -rf ${train_source_path}
-cd data
-echo "downloading sample_test.tar.gz......"
-curl -# 'http://jupter-oss.oss-cn-hangzhou.aliyuncs.com/file/opensearch/documents/408/sample_test.tar.gz?Expires=1586435821&OSSAccessKeyId=LTAIGx40tjZWxj6q&Signature=OwLMPjt1agByQtRVi8pazsAliNk%3D&response-content-disposition=attachment%3B%20' -H 'Proxy-Connection: keep-alive' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' -H 'Accept-Language: zh-CN,zh;q=0.9' --compressed --insecure -o sample_test.tar.gz
-cd ..
-echo "unzipping sample_test.tar.gz......"
-tar -xzvf ${test_source_path} -C ${test_target_path} && rm -rf ${test_source_path}
-echo "preprocessing data......"
-python reader.py --train_data_path ${train_target_path} \
-    --test_data_path ${test_target_path} \
-    --vocab_path vocab/vocab_size.txt \
-    --train_sample_size 6400 \
-    --test_sample_size 6400 \
+wget -P train_data/ https://paddlerec.bj.bcebos.com/esmm/traindata.csv
+wget -P test_data/ https://paddlerec.bj.bcebos.com/esmm/testdata.csv
-python infer.py --use_gpu True\  # whether to use gpu
-    --batch_size 64\  # batch size
-    --test_data_path ./test_data \  # test data path
-    --vocab_path ./vocab/vocab_size.txt  # path to the embedding vocabulary size file
+python infer.py --use_gpu 1\
+    --batch_size 64\
+    --test_data_path ./test_data\
+    --vocab_path ./vocab_size.txt
\ No newline at end of file
-CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu True\  # whether to use gpu
-    --epochs 100\  # number of training epochs
-    --batch_size 64\  # batch size
-    --embed_size 12\  # embedding dimension of each featsign
-    --cpu_num 2\  # number of CPU cores
-    --model_dir ./model_dir \  # model save path
-    --train_data_path ./train_data \  # training data path
-    --vocab_path ./vocab/vocab_size.txt  # path to the embedding vocabulary size file
+CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 1\
+    --epochs 100\
+    --batch_size 64\
+    --embed_size 12\
+    --cpu_num 2\
+    --model_dir ./model_dir\
+    --train_data_path ./train_data\
+    --vocab_path ./vocab_size.txt
\ No newline at end of file
@@ -21,7 +21,7 @@ def set_zero(place):
        param_array = np.zeros(param._get_dims()).astype("int64")
        param.set(param_array, place)
-def run_infer(args,model_path,test_data_path,vocab_size):
+def run_infer(args, model_path, test_data_path, vocab_size):
    place = fluid.CPUPlace()
    esmm_model = ESMM()
@@ -33,10 +33,10 @@ def run_infer(args, model_path, test_data_path, vocab_size):
    inputs = esmm_model.input_data()
    avg_cost,auc_ctr,auc_ctcvr= esmm_model.net(inputs, vocab_size, args.embed_size)
-    dataset, file_list = utils.get_dataset(inputs, test_data_path,args.batch_size,args.cpu_num)
+    dataset, file_list = utils.get_dataset(inputs, test_data_path, args.batch_size,args.cpu_num)
    exe = fluid.Executor(place)
-    fluid.load(fluid.default_main_program(),os.path.join(model_path, "checkpoint"), exe)
+    fluid.load(fluid.default_main_program(), os.path.join(model_path, "checkpoint"), exe)
    set_zero(place)
......
@@ -7,7 +7,7 @@ import args
class ESMM(object):
-    def fc(self,tag, data, out_dim, active='prelu'):
+    def fc(self, tag, data, out_dim, active='prelu'):
        init_stddev = 1.0
        scales = 1.0 / np.sqrt(data.shape[1])
@@ -35,7 +35,7 @@ class ESMM(object):
        return inputs
-    def net(self,inputs,vocab_size,embed_size):
+    def net(self, inputs, vocab_size, embed_size):
        emb = []
        for data in inputs[0:-2]:
@@ -47,7 +47,7 @@ class ESMM(object):
                ),
                is_sparse=True)
-            field_emb = fluid.layers.sequence_pool(input=feat_emb,pool_type='sum')
+            field_emb = fluid.layers.sequence_pool(input=feat_emb, pool_type='sum')
            emb.append(field_emb)
        concat_emb = fluid.layers.concat(emb, axis=1)
@@ -60,7 +60,7 @@ class ESMM(object):
        # cvr
        cvr_fc1 = self.fc('cvr_fc1', concat_emb, 200, active)
        cvr_fc2 = self.fc('cvr_fc2', cvr_fc1, 80, active)
-        cvr_out = self.fc('cvr_out', cvr_fc2, 2,'softmax')
+        cvr_out = self.fc('cvr_out', cvr_fc2, 2, 'softmax')
        ctr_clk = inputs[-2]
        ctcvr_buy = inputs[-1]
@@ -69,10 +69,10 @@ class ESMM(object):
        cvr_prop_one = fluid.layers.slice(cvr_out, axes=[1], starts=[1], ends=[2])
        ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one, cvr_prop_one)
-        ctcvr_prop = fluid.layers.concat(input=[1-ctcvr_prop_one,ctcvr_prop_one], axis = 1)
-        loss_ctr = paddle.fluid.layers.cross_entropy(input=ctr_out, label=ctr_clk)
-        loss_ctcvr = paddle.fluid.layers.cross_entropy(input=ctcvr_prop, label=ctcvr_buy)
+        ctcvr_prop = fluid.layers.concat(input=[1 - ctcvr_prop_one, ctcvr_prop_one], axis = 1)
+        loss_ctr = fluid.layers.cross_entropy(input=ctr_out, label=ctr_clk)
+        loss_ctcvr = fluid.layers.cross_entropy(input=ctcvr_prop, label=ctcvr_buy)
        cost = loss_ctr + loss_ctcvr
        avg_cost = fluid.layers.mean(cost)
......
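For readers skimming the hunk above: it wires up the standard ESMM factorization, in which CVR is never supervised directly. With y the click label and z the post-click conversion label, the ops compute (math restated from the code, not part of the diff):

```latex
% ctcvr_prop_one = ctr_prop_one * cvr_prop_one, all over the impression space
p(y=1, z=1 \mid x) \;=\; p(y=1 \mid x)\;\, p(z=1 \mid y=1, x)

% joint objective: two cross-entropy terms on the observable labels
L \;=\; \mathrm{CE}\big(\mathrm{ctr\_out},\, \mathrm{ctr\_clk}\big)
   \;+\; \mathrm{CE}\big(\mathrm{ctcvr\_prop},\, \mathrm{ctcvr\_buy}\big)
```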
import numpy as np
import pandas as pd
from collections import defaultdict
import args
import os

def join_data(file1, file2, write_file, sample_size):
    # Join skeleton samples with their common-feature lines on common_feature_index.
    sample_list = []
    common_logs = defaultdict(lambda: '')
    file = open(write_file, 'w')
    print("begin push sample_list!")
    with open(file1, 'r') as f:
        for i, line in enumerate(f):
            try:
                sample_list.append(line)
            except:
                continue
    print("begin push common_logs!")
    with open(file2, 'r') as f:
        for i, line in enumerate(f):
            try:
                common_feature_index, sample_str = line.strip().split('\t')
                common_logs[common_feature_index] = sample_str
            except:
                continue
    print("begin join data!")
    for i, sample in enumerate(sample_list):
        try:
            common_feature_index, sample_str = sample.strip().split('\t')
            common_str = common_logs.get(common_feature_index)
            if common_str:
                sample = "{0},{1}".format(sample_str, common_str)
            else:
                sample = "{0}".format(sample_str)
            file.write(sample + "\n")
        except:
            continue
        if(i == sample_size):
            break
    print("join data successfully!")

def read_data(file_name, write_file):
    file = open(write_file, 'w')
    print("begin to write!")
    with open(file_name, 'r') as f:
        for i, line in enumerate(f):
            try:
                line = line.strip().split(',')
                feat_len = len(line)
                feat_lists = []
                # common_feature_index|feat_num|feat_list
                if(feat_len == 3):
                    feat_strs = line[2]
                    for fstr in feat_strs.split('\x01'):
                        field, feat_val = fstr.split('\x02')
                        feat, val = feat_val.split('\x03')
                        feat_lists.append('%s:%s' % (field, feat))
                    common_feature = "{0}\t{1}".format(line[0], ','.join(feat_lists)) + "\n"
                    file.write(common_feature)
                # sample_id|y|z|common_feature_index|feat_num|feat_list
                elif(feat_len == 6):
                    # filter out inconsistent samples with y=0 & z=1
                    if(line[1] == '0' and line[2] == '1'):
                        continue
                    feat_strs = line[5]
                    for fstr in feat_strs.split('\x01'):
                        field, feat_val = fstr.split('\x02')
                        feat, val = feat_val.split('\x03')
                        feat_lists.append('%s:%s' % (field, feat))
                    sample = "{0}\t{1},{2},{3},{4}".format(line[3], line[0], line[1], line[2], ','.join(feat_lists)) + "\n"
                    file.write(sample)
            except:
                continue
    file.close()

def recode(file_path, write_file, vocab_path):
    # Re-map raw feature ids to a contiguous id space and record the vocabulary size.
    all_feat_id_dict = defaultdict(int)
    file1 = open(write_file[0], 'w')
    file2 = open(write_file[1], 'w')
    vocab_file = open(vocab_path, 'w')
    id = 0
    with open(file_path[0], "r") as f:
        for i, line in enumerate(f):
            line = line.strip().split(',')
            feat_lists = []
            for elem in line[3:]:
                field_id, feat_id = elem.strip().split(':')
                if feat_id not in all_feat_id_dict:
                    id += 1
                    all_feat_id_dict[feat_id] = id
                feat_lists.append('%s:%s' % (field_id, all_feat_id_dict[feat_id]))
            sample = "{0},{1},{2},{3}".format(line[0], line[1], line[2], ','.join(feat_lists)) + "\n"
            file1.write(sample)
    with open(file_path[1], "r") as f:
        for i, line in enumerate(f):
            line = line.strip().split(',')
            feat_lists = []
            for elem in line[3:]:
                field_id, feat_id = elem.strip().split(':')
                if feat_id not in all_feat_id_dict:
                    id += 1
                    all_feat_id_dict[feat_id] = id
                feat_lists.append('%s:%s' % (field_id, all_feat_id_dict[feat_id]))
            sample = "{0},{1},{2},{3}".format(line[0], line[1], line[2], ','.join(feat_lists)) + "\n"
            file2.write(sample)
    vocab_size = len(all_feat_id_dict)
    vocab_file.write(str(vocab_size))
    file1.close()
    file2.close()
    vocab_file.close()

if __name__ == "__main__":
    args = args.parse_args()
    read_data(args.train_data_path + '/sample_skeleton_train.csv', args.train_data_path + '/skeleton_train.csv')
    print("write skeleton_train.csv successfully")
    read_data(args.train_data_path + '/common_features_train.csv', args.train_data_path + '/features_train.csv')
    print("write features_train.csv successfully")
    skeleton_train_path = args.train_data_path + '/skeleton_train.csv'
    features_train_path = args.train_data_path + '/features_train.csv'
    write_file = args.train_data_path + '/train_data.csv'
    join_data(skeleton_train_path, features_train_path, write_file, args.train_sample_size)
    os.system('rm -rf ' + skeleton_train_path)
    os.system('rm -rf ' + features_train_path)
    read_data(args.test_data_path + '/sample_skeleton_test.csv', args.test_data_path + '/skeleton_test.csv')
    print("write skeleton_test.csv successfully")
    read_data(args.test_data_path + '/common_features_test.csv', args.test_data_path + '/features_test.csv')
    print("write features_test.csv successfully")
    skeleton_test_path = args.test_data_path + '/skeleton_test.csv'
    features_test_path = args.test_data_path + '/features_test.csv'
    write_file = args.test_data_path + '/test_data.csv'
    join_data(skeleton_test_path, features_test_path, write_file, args.test_sample_size)
    os.system('rm -rf ' + skeleton_test_path)
    os.system('rm -rf ' + features_test_path)
    file_path = [args.train_data_path + '/train_data.csv', args.test_data_path + '/test_data.csv']
    write_file = [args.train_data_path + '/traindata.csv', args.test_data_path + '/testdata.csv']
    recode(file_path, write_file, args.vocab_path)
    for file in file_path:
        os.system('rm -rf ' + file)
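For reference, the Ali-CCP feature cells that `read_data` splits on `\x01`, `\x02`, and `\x03` have the shape sketched below; the sample string is invented for illustration, not taken from the dataset:

```python
# A hypothetical feature cell: fields are joined by \x01, and each
# field is "field_id\x02feat_id\x03feat_val", exactly as read_data parses it.
raw = '101\x021001\x030.5\x01205\x022050\x031.0'

feat_lists = []
for fstr in raw.split('\x01'):
    field, feat_val = fstr.split('\x02')
    feat, val = feat_val.split('\x03')   # the value is parsed but discarded
    feat_lists.append('%s:%s' % (field, feat))

print(','.join(feat_lists))  # -> 101:1001,205:2050
```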
@@ -10,9 +10,9 @@ def train(args, vocab_size, train_data_path):
    esmm_model = ESMM()
    inputs = esmm_model.input_data()
-    dataset, file_list = utils.get_dataset(inputs, train_data_path,args.batch_size,args.cpu_num)
-    avg_cost,auc_ctr,auc_ctcvr= esmm_model.net(inputs, vocab_size, args.embed_size)
+    dataset, file_list = utils.get_dataset(inputs, train_data_path, args.batch_size,args.cpu_num)
+    avg_cost, auc_ctr, auc_ctcvr = esmm_model.net(inputs, vocab_size, args.embed_size)
    optimizer = fluid.optimizer.Adam()
    optimizer.minimize(avg_cost)
@@ -29,11 +29,11 @@ def train(args, vocab_size, train_data_path):
    dataset.set_filelist(file_list)
    exe.train_from_dataset(program=fluid.default_main_program(),
                           dataset=dataset,
-                           fetch_list=[avg_cost,auc_ctr,auc_ctcvr],
-                           fetch_info=['epoch %d batch loss' % (epoch), "auc_ctr","auc_ctcvr"],
+                           fetch_list=[avg_cost, auc_ctr, auc_ctcvr],
+                           fetch_info=['epoch %d batch loss' % (epoch), "auc_ctr", "auc_ctcvr"],
                           print_period=20,
                           debug=False)
-    model_dir = os.path.join(args.model_dir,'epoch_' + str(epoch + 1), "checkpoint")
+    model_dir = os.path.join(args.model_dir, 'epoch_' + str(epoch + 1), "checkpoint")
    main_program = fluid.default_main_program()
    fluid.io.save(main_program,model_dir)
......
+129590
\ No newline at end of file
@@ -35,9 +35,8 @@ GPU environment
```sh
CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 1\  # use gpu
-    --epochs 3\
    --batch_size 32\
-    --model_dir './model_dir'\  # model save path
+    --model_dir ./model_dir\  # model save path
    --embd_dim 16\  # embedding dimension
    --hidden_size 128\  # biRNN hidden layer size
    --item_vocab 200\  # item vocabulary size
@@ -60,9 +59,8 @@ CPU environment
```sh
python train.py --use_gpu 0\  # use cpu
-    --epochs 3\
    --batch_size 32\
-    --model_dir './model_dir'\  # model save path
+    --model_dir ./model_dir\  # model save path
    --embd_dim 16\  # embedding dimension
    --hidden_size 128\  # biRNN hidden layer size
    --item_vocab 200\  # item vocabulary size
@@ -87,8 +85,8 @@ GPU environment
```sh
CUDA_VISIBLE_DEVICES=0 python infer.py --use_gpu 1 \  # use gpu
-    --model_dir './model_dir'\
-    --test_epoch 19  # which epoch's model parameters to load
+    --model_dir ./model_dir\
+    --test_epoch 1  # which epoch's model parameters to load
```
......
@@ -24,7 +24,7 @@ def parse_args():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--epochs", type=int, default=20, help="epochs")
    parser.add_argument("--batch_size", type=int, default=32, help="batch_size")
-    parser.add_argument("--test_epoch", type=int, default=19, help="test_epoch")
+    parser.add_argument("--test_epoch", type=int, default=1, help="test_epoch")
    parser.add_argument('--use_gpu', type=int, default=0, help='whether using gpu')
    parser.add_argument('--model_dir', type=str, default='./model_dir', help='model_dir')
    parser.add_argument('--embd_dim', type=int, default=16, help='embd_dim')
......
@@ -16,9 +16,6 @@ class BiRNN(object):
    def default_normal_initializer(self, nf=128):
        return fluid.initializer.TruncatedNormal(loc=0.0, scale=np.sqrt(1.0/nf))
-    def default_param_clip(self):
-        return fluid.clip.GradientClipByValue(1.0)
    def default_regularizer(self):
        return None
@@ -27,22 +24,18 @@ class BiRNN(object):
                size=size,
                num_flatten_dims=num_flatten_dims,
                param_attr=fluid.ParamAttr(initializer=self.default_normal_initializer(size),
-                                          gradient_clip=self.default_param_clip(),
                                           regularizer=self.default_regularizer()),
                bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(value=0.0),
-                                         gradient_clip=self.default_param_clip(),
                                          regularizer=self.default_regularizer()),
                act=act,
                name=name)
    def default_embedding(self, data, vocab_size, embed_size):
-        gradient_clip = self.default_param_clip()
        reg = fluid.regularizer.L2Decay(1e-5)  # IMPORTANT, to prevent overfitting.
        embed = fluid.embedding(input=data,
                                size=[vocab_size, embed_size],
                                param_attr=fluid.ParamAttr(initializer=fluid.initializer.Xavier(),
-                                                          gradient_clip=gradient_clip,
                                                           regularizer=reg),
                                is_sparse=True)
        return embed
@@ -51,10 +44,8 @@ class BiRNN(object):
        return fluid.layers.dynamic_gru(input=data,
                                        size=nf,
                                        param_attr=fluid.ParamAttr(initializer=self.default_normal_initializer(nf),
-                                                                  gradient_clip=self.default_param_clip(),
                                                                   regularizer=self.default_regularizer()),
                                        bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(value=0.0),
-                                                                 gradient_clip=self.default_param_clip(),
                                                                  regularizer=self.default_regularizer()),
                                        is_reverse=is_reverse,
                                        h_0=h_0)
......
@@ -11,6 +11,30 @@ logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
+user_id = 0
+class Dataset(object):
+    def _reader_creator(self):
+        def reader():
+            global user_id
+            user_slot_name = []
+            for j in range(args.batch_size):
+                user_slot_name.append([user_id])
+                user_id += 1
+            item_slot_name = np.random.randint(args.item_vocab, size=(args.batch_size, args.item_len)).tolist()
+            length = [args.item_len] * args.batch_size
+            label = np.random.randint(2, size=(args.batch_size, args.item_len)).tolist()
+            output = []
+            output.append(user_slot_name)
+            output.append(item_slot_name)
+            output.append(length)
+            output.append(label)
+            yield output
+        return reader
+    def get_test_data(self):
+        return self._reader_creator()
def set_zero(var_name, scope=fluid.global_scope(), place=fluid.CPUPlace(), param_type="int64"):
    """
    Set tensor of a Variable to zero.
@@ -41,42 +65,23 @@ def run_infer(args):
        for var in auc_states:  # reset auc states
            set_zero(var.name, scope=inference_scope, place=place)
-        # Build a random data set.
-        user_slot_names = []
-        item_slot_names = []
-        lens = []
-        labels = []
-        user_id = 0
-        for i in range(args.sample_size):
-            user_slot_name = []
-            for j in range(args.batch_size):
-                user_slot_name.append(user_id)
-                user_id += 1
-            user_slot_names.append(user_slot_name)
-            item_slot_name = np.random.randint(args.item_vocab, size=(args.batch_size, args.item_len))
-            item_slot_names.append(item_slot_name)
-            length = np.array([args.item_len]*args.batch_size)
-            lens.append(length)
-            label = np.random.randint(2, size=(args.batch_size, args.item_len))
-            labels.append(label)
+        test_data_generator = Dataset()
+        test_reader = fluid.io.batch(test_data_generator.get_test_data(), batch_size=args.batch_size)
+        loader = fluid.io.DataLoader.from_generator(feed_list=inputs, capacity=args.batch_size, iterable=True)
+        loader.set_sample_list_generator(test_reader, places=place)
        for i in range(args.sample_size):
-            begin = time.time()
-            loss_val, auc = exe.run(test_program,
-                                    feed={
-                                        "user_slot_names": np.array(user_slot_names[i]).reshape(args.batch_size, 1),
-                                        "item_slot_names": item_slot_names[i].astype('int64'),
-                                        "lens": lens[i].astype('int64'),
-                                        "labels": labels[i].astype('int64')
-                                    },
-                                    return_numpy=True,
-                                    fetch_list=[loss.name, auc_val])
-            end = time.time()
-            logger.info("batch_time: {:.5f}s, loss: {:.5f}, auc: {:.5f}".format(
-                end-begin, float(np.array(loss_val)), float(np.array(auc))))
+            for batch_id, data in enumerate(loader()):
+                begin = time.time()
+                loss_val, auc = exe.run(program=fluid.default_main_program(),
+                                        feed=data,
+                                        fetch_list=[loss.name, auc_val],
+                                        return_numpy=True)
+                end = time.time()
+                logger.info("batch_id: {}, batch_time: {:.5f}s, loss: {:.5f}, auc: {:.5f}".format(
+                    batch_id, end-begin, float(np.array(loss_val)), float(np.array(auc))))
if __name__ == "__main__":
    args = args.parse_args()
+    logger.info("use_gpu: {}, model_dir: {}, test_epoch: {}".format(args.use_gpu, args.model_dir, args.test_epoch))
    run_infer(args)
\ No newline at end of file
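The refactor above replaces hand-built feed dicts with a generator-backed `DataLoader`. Stripped of the Paddle wiring, the reader contract is just a zero-argument generator yielding one `[users, items, lengths, labels]` sample list per call; a minimal pure-Python sketch of that contract (constants invented for illustration, not code from this PR):

```python
import numpy as np

BATCH_SIZE, ITEM_LEN, ITEM_VOCAB = 32, 5, 200  # assumed values, mirroring the defaults

def make_reader():
    user_id = 0
    def reader():
        nonlocal user_id
        # one batch of sequential user ids, shaped [batch, 1] like user_slot_name
        users = [[user_id + j] for j in range(BATCH_SIZE)]
        user_id += BATCH_SIZE
        # random item ids and click labels, as in the Dataset class above
        items = np.random.randint(ITEM_VOCAB, size=(BATCH_SIZE, ITEM_LEN)).tolist()
        lengths = [ITEM_LEN] * BATCH_SIZE
        labels = np.random.randint(2, size=(BATCH_SIZE, ITEM_LEN)).tolist()
        yield [users, items, lengths, labels]
    return reader

sample = next(make_reader()())         # each yield is one 4-slot sample list
print([len(slot) for slot in sample])  # -> [32, 32, 32, 32]
```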
-python infer.py --use_gpu 0 --model_dir './model_dir' --test_epoch 19
+python infer.py --use_gpu 0 --model_dir ./model_dir --test_epoch 1
-CUDA_VISIBLE_DEVICES=0 python infer.py --use_gpu 1 --model_dir './model_dir' --test_epoch 19
+CUDA_VISIBLE_DEVICES=0 python infer.py --use_gpu 1 --model_dir ./model_dir --test_epoch 1
@@ -10,6 +10,29 @@ from evaluator import BiRNN
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
+user_id = 0
+class Dataset(object):
+    def _reader_creator(self):
+        def reader():
+            global user_id
+            user_slot_name = []
+            for j in range(args.batch_size):
+                user_slot_name.append([user_id])
+                user_id += 1
+            item_slot_name = np.random.randint(args.item_vocab, size=(args.batch_size, args.item_len)).tolist()
+            length = [args.item_len] * args.batch_size
+            label = np.random.randint(2, size=(args.batch_size, args.item_len)).tolist()
+            output = []
+            output.append(user_slot_name)
+            output.append(item_slot_name)
+            output.append(length)
+            output.append(label)
+            yield output
+        return reader
+    def get_train_data(self):
+        return self._reader_creator()
def train(args):
@@ -23,48 +46,32 @@ def train(args):
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
+    train_data_generator = Dataset()
+    train_reader = fluid.io.batch(train_data_generator.get_train_data(), batch_size=args.batch_size)
+    loader = fluid.io.DataLoader.from_generator(feed_list=inputs, capacity=args.batch_size, iterable=True)
+    loader.set_sample_list_generator(train_reader, places=place)
-    # Build a random data set.
-    user_slot_names = []
-    item_slot_names = []
-    lens = []
-    labels = []
-    user_id = 0
-    for i in range(args.sample_size):
-        user_slot_name = []
-        for j in range(args.batch_size):
-            user_slot_name.append(user_id)
-            user_id += 1
-        user_slot_names.append(user_slot_name)
-        item_slot_name = np.random.randint(args.item_vocab, size=(args.batch_size, args.item_len))
-        item_slot_names.append(item_slot_name)
-        length = np.array([args.item_len]*args.batch_size)
-        lens.append(length)
-        label = np.random.randint(2, size=(args.batch_size, args.item_len))
-        labels.append(label)
-    for epoch in range(args.epochs):
-        for i in range(args.sample_size):
-            begin = time.time()
-            loss_val, auc = exe.run(fluid.default_main_program(),
-                                    feed={
-                                        "user_slot_names": np.array(user_slot_names[i]).reshape(args.batch_size, 1),
-                                        "item_slot_names": item_slot_names[i].astype('int64'),
-                                        "lens": lens[i].astype('int64'),
-                                        "labels": labels[i].astype('int64')
-                                    },
-                                    return_numpy=True,
-                                    fetch_list=[loss.name, auc_val])
-            end = time.time()
-            logger.info("epoch_id: {}, batch_time: {:.5f}s, loss: {:.5f}, auc: {:.5f}".format(
-                epoch, end-begin, float(np.array(loss_val)), float(np.array(auc))))
+    for i in range(args.sample_size):
+        for batch_id, data in enumerate(loader()):
+            begin = time.time()
+            loss_val, auc = exe.run(program=fluid.default_main_program(),
+                                    feed=data,
+                                    fetch_list=[loss.name, auc_val],
+                                    return_numpy=True)
+            end = time.time()
+            logger.info("batch_id: {}, batch_time: {:.5f}s, loss: {:.5f}, auc: {:.5f}".format(
+                batch_id, end-begin, float(np.array(loss_val)), float(np.array(auc))))
    #save model
-    model_dir = os.path.join(args.model_dir, 'epoch_' + str(epoch + 1), "checkpoint")
+    model_dir = os.path.join(args.model_dir, 'epoch_' + str(1), "checkpoint")
    main_program = fluid.default_main_program()
    fluid.save(main_program, model_dir)
if __name__ == "__main__":
    args = args.parse_args()
+    logger.info("use_gpu: {}, batch_size: {}, model_dir: {}, embd_dim: {}, hidden_size: {}, item_vocab: {}, user_vocab: {},\
+        item_len: {}, sample_size: {}, base_lr: {}".format(args.use_gpu, args.batch_size, args.model_dir, args.embd_dim,
+        args.hidden_size, args.item_vocab, args.user_vocab, args.item_len, args.sample_size, args.base_lr))
    train(args)
\ No newline at end of file
-CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 0 --epochs 20 --batch_size 32 --model_dir './model_dir' --embd_dim 16 --hidden_size 128 --item_vocab 200 --user_vocab 200 --item_len 5 --sample_size 100 --base_lr 0.01
+CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 0 --epochs 20 --batch_size 32 --model_dir ./model_dir --embd_dim 16 --hidden_size 128 --item_vocab 200 --user_vocab 200 --item_len 5 --sample_size 100 --base_lr 0.01
-CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 1 --epochs 20 --batch_size 32 --model_dir './model_dir' --embd_dim 16 --hidden_size 128 --item_vocab 200 --user_vocab 200 --item_len 5 --sample_size 100 --base_lr 0.01
+CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 1 --epochs 20 --batch_size 32 --model_dir ./model_dir --embd_dim 16 --hidden_size 128 --item_vocab 200 --user_vocab 200 --item_len 5 --sample_size 100 --base_lr 0.01
@@ -12,7 +12,7 @@ def cos_sim(vector_a, vector_b):
    sim = 0.5 + 0.5 * cos
    return sim
-def get_topK(args, K):
+def get_topK(args):
    video_vec = pd.read_csv(args.video_vec_path, header=None)
    user_vec = pd.read_csv(args.user_vec_path, header=None)
@@ -24,11 +24,11 @@ def get_topK(args):
    tmp_list=copy.deepcopy(user_video_sim_list)
    tmp_list.sort()
-    max_sim_index=[user_video_sim_list.index(one) for one in tmp_list[::-1][:K]]
+    max_sim_index=[user_video_sim_list.index(one) for one in tmp_list[::-1][:args.topk]]
    print("user:{0}, top K videos:{1}".format(i, max_sim_index))
    user_video_sim_list = []
if __name__ == "__main__":
    args = args.parse_args()
-    get_topK(args, 5)
+    get_topK(args)
\ No newline at end of file
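Context for this hunk: `cos_sim`, shown at the top of the diff, shifts raw cosine similarity from [-1, 1] into [0, 1] before the top-K selection; with a the user vector and b a video vector, it computes (restating the code in math):

```latex
\mathrm{sim}(a, b) \;=\; 0.5 + 0.5\cos(a, b)
                   \;=\; 0.5 + 0.5\,\frac{a \cdot b}{\lVert a \rVert\,\lVert b \rVert}
                   \;\in\; [0, 1]
```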