Commit c3f68339 authored by: yudongxu (许煜东)

fix some bugs

Parent: 90fce5a7
@@ -81,11 +81,11 @@ def build_model_columns(train_data_path, test_data_path):
     train_df['label'] = train_df['income_bracket'].apply(lambda x : 1 if x == '>50K' else 0)
     test_df['label'] = test_df['income_bracket'].apply(lambda x : 1 if x == '>50K' else 0)
-    with io.open('train_data/columns.txt','w') as f:
+    with open('train_data/columns.txt','w') as f:
         write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n'
         f.write(write_str)
         f.close()
-    with io.open('test_data/columns.txt','w') as f:
+    with open('test_data/columns.txt','w') as f:
         write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n'
         f.write(write_str)
         f.close()
......
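Side note on the io.open → open change above: in Python 3 the built-in open is io.open, so the io-qualified call (and the import it requires) is unnecessary; the explicit f.close() inside a with block is also redundant, since the context manager closes the file. A minimal sketch of the equivalent idiomatic write, assuming wide_columns and deep_columns are the column lists this function has already built:

```python
# Sketch only: wide_columns/deep_columns are assumed to be the feature-column
# lists constructed earlier in build_model_columns.
def write_column_counts(path, wide_columns, deep_columns):
    # The with-statement closes the file on exit; no f.close() needed.
    with open(path, 'w') as f:
        f.write(str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n')

write_column_counts('train_data/columns.txt', wide_columns, deep_columns)
write_column_counts('test_data/columns.txt', wide_columns, deep_columns)
```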
@@ -69,5 +69,7 @@ def run_infer(args,test_data_path):
 
 if __name__ == "__main__":
     args = args.parse_args()
+    logger.info("batch_size: {}, use_gpu: {}, test_epoch: {}, test_data_path: {}, model_dir:{}, hidden1_units: {}, hidden2_units: {}, hidden3_units: {}".format(
+        args.batch_size, args.use_gpu, args.test_epoch, args.test_data_path, args.model_dir, args.hidden1_units, args.hidden2_units, args.hidden3_units))
     run_infer(args, args.test_data_path)
\ No newline at end of file
@@ -47,4 +47,6 @@ def train(args, train_data_path):
 
 if __name__ == "__main__":
     args = args.parse_args()
+    logger.info("epoch:{}, batch_size: {}, use_gpu: {}, train_data_path: {}, model_dir: {}, hidden1_units: {}, hidden2_units: {}, hidden3_units: {}".format(
+        args.epoch, args.batch_size, args.use_gpu, args.train_data_path, args.model_dir, args.hidden1_units, args.hidden2_units, args.hidden3_units))
     train(args, args.train_data_path)
@@ -23,6 +23,16 @@ DSSM[《Learning Deep Structured Semantic Models for Web Search using Clickthrough Data》]
 python3.7
 
+## Dataset description
+Since the paper does not release a public dataset, this project constructs synthetic data to verify that the network is correct:
+query: randomly constructed query vector representations
+doc_pos: randomly constructed positive-document vector representations
+doc_neg_0~3: four randomly constructed negative-document vector representations
+
 ## Single-machine training
 GPU environment
......
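For illustration, a minimal sketch of how such random inputs could be generated; the numpy-based approach and the shapes are assumptions, not necessarily the project's actual generator:

```python
import numpy as np

# Assumed sizes: TRIGRAM_D matches the model's default input dimension below.
TRIGRAM_D, BATCH, NEG = 1000, 64, 4

query = np.random.rand(BATCH, TRIGRAM_D).astype('float32')    # random query vectors
doc_pos = np.random.rand(BATCH, TRIGRAM_D).astype('float32')  # random positive docs
# doc_neg_0 ~ doc_neg_3: four random negative-document vectors
doc_negs = [np.random.rand(BATCH, TRIGRAM_D).astype('float32') for _ in range(NEG)]
```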
@@ -57,7 +57,10 @@ def model(TRIGRAM_D = 1000, L1_N = 300, L2_N = 300, L3_N = 128, Neg = 4):
     return avg_loss, R_Q_D_p, [query] + [doc_pos] + doc_negs
 
 args = args.parse_args()
-loss,R_Q_D_p, data_list = model(args.TRIGRAM_D,args.L1_N,args.L2_N,args.L3_N,args.Neg)
+logger.info("use_gpu: {}, batch_size: {}, TRIGRAM_D: {}, L1_N:{}, L2_N: {}, L3_N: {}, Neg: {}, base_lr: {}, model_dir: {}".format(
+    args.use_gpu, args.batch_size, args.TRIGRAM_D, args.L1_N, args.L2_N, args.L3_N, args.Neg, args.base_lr, args.model_dir))
+loss, R_Q_D_p, data_list = model(args.TRIGRAM_D, args.L1_N, args.L2_N, args.L3_N, args.Neg)
 sgd = fluid.optimizer.SGD(learning_rate=args.base_lr)
 sgd.minimize(loss)
......
@@ -37,4 +37,6 @@ def infer(args):
 
 if __name__ == "__main__":
     args = args.parse_args()
+    logger.info("use_gpu: {}, model_dir: {}".format(args.use_gpu, args.model_dir))
     infer(args)
\ No newline at end of file
@@ -96,6 +96,8 @@ GPU environment
 ```sh
 python infer.py --use_gpu 1\ # whether to use GPU
                 --batch_size 64\ # batch size
+                --cpu_num 2\ # number of CPU threads
+                --model_dir ./model_dir \ # model save path
                 --test_data_path ./test_data \ # test data path
                 --vocab_path ./vocab_size.txt # path to the embedding vocab-size file
 ```
@@ -114,6 +116,7 @@ CPU environment
 python infer.py --use_gpu 0\ # whether to use GPU
                 --batch_size 64\ # batch size
                 --cpu_num 2\ # number of CPU threads
+                --model_dir ./model_dir \ # model save path
                 --test_data_path ./test_data \ # test data path
                 --vocab_path ./vocab_size.txt # path to the embedding vocab-size file
 ```
......
 python infer.py --use_gpu 0 \
                 --batch_size 64 \
                 --cpu_num 2 \
+                --model_dir ./model_dir \
                 --test_data_path ./test_data \
                 --vocab_path ./vocab_size.txt
\ No newline at end of file
@@ -9,7 +9,7 @@ all_field_id_dict = defaultdict(int)
 for i,field_id in enumerate(all_field_id):
     all_field_id_dict[field_id] = [False,i]
 
-class CriteoDataset(dg.MultiSlotStringDataGenerator):
+class Dataset(dg.MultiSlotStringDataGenerator):
     def generate_sample(self, line):
@@ -40,5 +40,5 @@ class CriteoDataset(dg.MultiSlotStringDataGenerator):
             yield output
         return reader
 
-d = CriteoDataset()
+d = Dataset()
 d.run_from_stdin()
\ No newline at end of file
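For context, the renamed class follows Paddle's pipe-based data-generator pattern: the dataset's pipe command (see utils.py below) runs this script once per data file, feeding raw lines on stdin and reading parsed slots back from stdout. A minimal sketch of that pattern, with hypothetical slot names rather than the project's real ones:

```python
import paddle.fluid.incubate.data_generator as dg

class Dataset(dg.MultiSlotStringDataGenerator):
    def generate_sample(self, line):
        def reader():
            fields = line.strip().split(',')
            # Each element is a (slot_name, list-of-string-values) pair;
            # 'click' and 'field_id' are illustrative, not the real slots.
            yield [('click', [fields[0]]), ('field_id', [fields[1]])]
        return reader

d = Dataset()
d.run_from_stdin()  # reads raw lines from stdin, writes parsed slots to stdout
```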
-python infer.py --use_gpu 1\
+CUDA_VISIBLE_DEVICES=0 python infer.py --use_gpu 1\
                 --batch_size 64\
+                --cpu_num 2 \
+                --model_dir ./model_dir \
                 --test_data_path ./test_data\
                 --vocab_path ./vocab_size.txt
\ No newline at end of file
-CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu True\
+CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 1\
                 --epochs 100\
                 --batch_size 64\
                 --embed_size 12\
                 --cpu_num 2\
-                --model_dir './model_dir'\
-                --train_data_path './train_data'\
-                --vocab_path './vocab/vocab_size.txt'
+                --model_dir ./model_dir\
+                --train_data_path ./train_data\
+                --vocab_path ./vocab/vocab_size.txt
\ No newline at end of file
@@ -33,7 +33,7 @@ def run_infer(args, model_path, test_data_path, vocab_size):
 
     inputs = esmm_model.input_data()
     avg_cost,auc_ctr,auc_ctcvr= esmm_model.net(inputs, vocab_size, args.embed_size)
-    dataset, file_list = utils.get_dataset(inputs, test_data_path, args.batch_size,args.cpu_num)
+    dataset, file_list = utils.get_dataset(inputs, test_data_path, args.batch_size, args.cpu_num)
 
     exe = fluid.Executor(place)
     fluid.load(fluid.default_main_program(), os.path.join(model_path, "checkpoint"), exe)
@@ -51,6 +51,9 @@ def run_infer(args, model_path, test_data_path, vocab_size):
 
 if __name__ == "__main__":
     args = args.parse_args()
+    logger.info("use_gpu: {}, epochs: {}, batch_size: {}, cpu_num: {}, model_dir: {}, test_data_path: {}, vocab_path: {}".format(args.use_gpu, args.epochs,
+        args.batch_size, args.cpu_num, args.model_dir, args.test_data_path, args.vocab_path))
     model_list = []
     for _, dir, _ in os.walk(args.model_dir):
         for model in dir:
@@ -58,10 +61,10 @@ if __name__ == "__main__":
             path = os.path.join(args.model_dir, model)
             model_list.append(path)
 
-    vocab_size =utils.get_vocab_size(args.vocab_path)
+    vocab_size = utils.get_vocab_size(args.vocab_path)
 
     for model in model_list:
         logger.info("Test model {}".format(model))
-        run_infer(args, model,args.test_data_path)
+        run_infer(args, model,args.test_data_path, vocab_size)
\ No newline at end of file
@@ -5,6 +5,11 @@ from net import ESMM
 import paddle
 import utils
 import args
+import logging
+
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger("fluid")
+logger.setLevel(logging.INFO)
 
 def train(args, vocab_size, train_data_path):
     esmm_model = ESMM()
@@ -39,5 +44,8 @@ def train(args, vocab_size, train_data_path):
 
 if __name__ == "__main__":
     args = args.parse_args()
+    logger.info("use_gpu: {}, epochs: {}, batch_size: {}, embed_size: {}, cpu_num: {}, model_dir: {}, train_data_path: {}, vocab_path: {}".format(args.use_gpu, args.epochs,
+        args.batch_size, args.embed_size, args.cpu_num, args.model_dir, args.train_data_path, args.vocab_path))
     vocab_size = utils.get_vocab_size(args.vocab_path)
     train(args, vocab_size, args.train_data_path)
@@ -14,7 +14,7 @@ all_field_id_dict = defaultdict(int)
 for i,field_id in enumerate(all_field_id):
     all_field_id_dict[field_id] = [False,i]
 
-def get_dataset(inputs,files,batch_size,cpu_num):
+def get_dataset(inputs, files, batch_size, cpu_num):
     dataset = fluid.DatasetFactory().create_dataset()
     dataset.set_use_var(inputs)
     dataset.set_pipe_command("python dataset_generator.py")
......
@@ -25,7 +25,7 @@ _model_path = None
 
 def run_infer(args, model_path, test_data_path):
-    test_data_generator = utils.CriteoDataset()
+    test_data_generator = utils.Dataset()
 
     with fluid.scope_guard(fluid.Scope()):
         test_reader = fluid.io.batch(
......
@@ -32,7 +32,7 @@ def get_train_data(filename, write_file, num_negatives):
     file = open(write_file, 'w')
     print("writing " + write_file)
-    for (u, i) in mat.keys():
+    for (u, i) in mat:
         # positive instance
         user_input = str(u)
         item_input = str(i)
......
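The mat.keys() → mat change above relies on the fact that iterating a dict-like interaction matrix directly yields its (user, item) keys; a toy illustration, assuming a plain dict stand-in for the real matrix:

```python
# Hypothetical stand-in for the user-item interaction matrix.
mat = {(0, 1): 1.0, (0, 3): 1.0, (2, 5): 1.0}
for (u, i) in mat:  # same keys as iterating mat.keys()
    print(u, i)     # each positive (user, item) pair
```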
@@ -27,6 +27,20 @@
 python3.7
 
+## Dataset description
+This project constructs a synthetic dataset to verify that the model is correct. Its fields are:
+user_slot_name: user-side feature-group id
+item_slot_name: item-side feature-group id
+lenght: length of the item list
+label: list indicating whether the user clicked each of the given items
+Note: because of how the synthetic data is constructed, this project trains for a single epoch only; across multiple epochs the data would change from epoch to epoch and the results would be meaningless.
+
 ## Single-machine training
 GPU environment
......
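As a rough illustration, one batch of such synthetic list-wise data could look like the following; the shapes and vocabulary sizes are assumptions, not the project's actual values:

```python
import numpy as np

# Assumed sizes for one batch of synthetic list-wise data.
batch_size, item_len, vocab = 32, 10, 100

user_slot_name = np.random.randint(0, vocab, (batch_size, 1)).astype('int64')
item_slot_name = np.random.randint(0, vocab, (batch_size, item_len)).astype('int64')
lenght = np.full((batch_size, 1), item_len, dtype='int64')  # field name as in the project
label = np.random.randint(0, 2, (batch_size, item_len)).astype('int64')  # per-item clicks
```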
@@ -22,7 +22,7 @@ import sys
 
 def parse_args():
     parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("--epochs", type=int, default=20, help="epochs")
+    parser.add_argument("--epochs", type=int, default=1, help="epochs")
     parser.add_argument("--batch_size", type=int, default=32, help="batch_size")
     parser.add_argument("--test_epoch", type=int, default=1, help="test_epoch")
     parser.add_argument('--use_gpu', type=int, default=0, help='whether using gpu')
......
@@ -51,17 +51,17 @@ def train(args):
     train_reader = fluid.io.batch(train_data_generator.get_train_data(), batch_size=args.batch_size)
     loader = fluid.io.DataLoader.from_generator(feed_list=inputs, capacity=args.batch_size, iterable=True)
     loader.set_sample_list_generator(train_reader, places=place)
 
+    for epoch in range(args.epochs):
         for i in range(args.sample_size):
             for batch_id, data in enumerate(loader()):
                 begin = time.time()
                 loss_val, auc = exe.run(program=fluid.default_main_program(),
                                         feed=data,
                                         fetch_list=[loss.name, auc_val],
                                         return_numpy=True)
                 end = time.time()
-                logger.info("batch_id: {}, batch_time: {:.5f}s, loss: {:.5f}, auc: {:.5f}".format(
-                    batch_id, end-begin, float(np.array(loss_val)), float(np.array(auc))))
+                logger.info("epoch: {}, batch_id: {}, batch_time: {:.5f}s, loss: {:.5f}, auc: {:.5f}".format(
+                    epoch, batch_id, end-begin, float(np.array(loss_val)), float(np.array(auc))))
 
     #save model
     model_dir = os.path.join(args.model_dir, 'epoch_' + str(1), "checkpoint")
......
@@ -34,6 +34,16 @@
 python3.7
 
+## Dataset description
+Since the original paper does not open-source its dataset, this project constructs random data with the following fields:
+watch_vecs: randomly constructed embedding representations of the user's watch history
+search_vec: randomly constructed embedding representation of the user's search history
+other_feat: randomly constructed embedding representations of other features
+
 ## Single-machine training
 GPU environment
......
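A minimal sketch of such random inputs; the vector sizes below are assumptions (the real defaults come from the flags logged in train.py further down):

```python
import numpy as np

# Assumed sizes, mirroring the watch_vec_size / search_vec_size /
# other_feat_size flags logged in train.py below.
batch, watch_vec_size, search_vec_size, other_feat_size = 32, 64, 64, 32

watch_vecs = np.random.rand(batch, watch_vec_size).astype('float32')  # watch history
search_vec = np.random.rand(batch, search_vec_size).astype('float32')  # search history
other_feat = np.random.rand(batch, other_feat_size).astype('float32')  # other features
```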
@@ -48,6 +48,9 @@ def infer(args):
 
 if __name__ == "__main__":
     args = args.parse_args()
+    logger.info("use_gpu: {}, test_epoch: {}, model_dir: {}, user_vec_path: {}".format(
+        args.use_gpu, args.test_epoch, args.model_dir, args.user_vec_path))
     if(os.path.exists(args.user_vec_path)):
         os.system("rm " + args.user_vec_path)
     infer(args)
\ No newline at end of file
@@ -70,6 +70,9 @@ def train(args):
 
 if __name__ == "__main__":
     args = args.parse_args()
+    logger.info("use_gpu: {}, batch_size: {}, epochs: {}, watch_vec_size: {}, search_vec_size: {}, other_feat_size: {}, output_size: {}, model_dir: {}, test_epoch: {}, base_lr: {}, video_vec_path: {}".format(
+        args.use_gpu, args.batch_size, args.epochs, args.watch_vec_size, args.search_vec_size, args.other_feat_size, args.output_size, args.model_dir, args.test_epoch, args.base_lr, args.video_vec_path))
     if(os.path.exists(args.video_vec_path)):
         os.system("rm " + args.video_vec_path)
     train(args)