Commit 6ca686a2 authored by Qiao Longfei

change dataset from text8 to enwik9

Parent dee19f6e
@@ -2,22 +2,7 @@
# DNN for Click-Through Rate prediction
## Introduction
This model implements the DNN part proposed in the following paper:
```text
@inproceedings{guo2017deepfm,
title={DeepFM: A Factorization-Machine based Neural Network for CTR Prediction},
author={Huifeng Guo, Ruiming Tang, Yunming Ye, Zhenguo Li and Xiuqiang He},
booktitle={the Twenty-Sixth International Joint Conference on Artificial Intelligence (IJCAI)},
pages={1725--1731},
year={2017}
}
```
DeepFM combines factorization machines and deep neural networks to model
both low-order and high-order feature interactions. For details of
factorization machines, please refer to the paper [Factorization
Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf).
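For reference, the FM component captures the low-order (second-order) interactions with the standard factorization machine model from the Rendle paper cited above; the equation below is background only, not code from this repository:

```latex
\hat{y}(\mathbf{x}) = w_0 + \sum_{i=1}^{n} w_i x_i
    + \sum_{i=1}^{n} \sum_{j=i+1}^{n} \langle \mathbf{v}_i, \mathbf{v}_j \rangle \, x_i x_j
```

The deep part handles the higher-order interactions on top of the same feature embeddings.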
## Environment
You should install PaddlePaddle Fluid first.
@@ -49,6 +34,10 @@
training dataset is split such that 90% is used for training and the other
10% is used for validation during training. In reader.py, the training data is the
first 90% of the data in train.txt, and the validation data is the rest.
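As a rough illustration of that split (a minimal sketch, not the actual reader.py in this repo):

```python
# Hypothetical helper: take the first 90% of lines of train.txt for training
# and keep the remaining 10% for validation, as described above.
def split_train_valid(path, train_ratio=0.9):
    with open(path) as f:
        lines = f.readlines()
    boundary = int(len(lines) * train_ratio)
    return lines[:boundary], lines[boundary:]


train_lines, valid_lines = split_train_valid('data/raw/train.txt')
```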
```bash
python preprocess.py --data_path data/enwik9 --dict_path data/enwik9_dict
```
## Train
The command line options for training can be listed by `python train.py -h`.
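For example, a local run on the preprocessed enwik9 data might look like the following; only flags that appear in this commit's train.py changes are used, so treat it as a sketch rather than the full option list:

```bash
python train.py \
    --train_data_path ./data/enwik9 \
    --dict_path ./data/enwik9_dict \
    --batch_size 500
```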
#!/bin/bash
wget http://mattmahoney.net/dc/enwik9.zip
unzip enwik9.zip
# -*- coding: utf-8 -*
import re
import argparse
def parse_args():
parser = argparse.ArgumentParser(
description="Paddle Fluid word2 vector preprocess")
parser.add_argument(
'--data_path',
type=str,
required=True,
help="The path of training dataset")
parser.add_argument(
'--dict_path',
type=str,
default='./dict',
help="The path of generated dict")
parser.add_argument(
'--freq',
type=int,
default=5,
help="If the word count is less then freq, it will be removed from dict")
return parser.parse_args()
def preprocess(data_path, dict_path, freq):
"""
proprocess the data, generate dictionary and save into dict_path.
:param data_path: the input data path.
:param dict_path: the generated dict path. the data in dict is "word count"
:param freq:
:return:
"""
# word to count
word_count = dict()
with open(data_path) as f:
for line in f:
line = line.lower()
line = re.sub("[^0-9a-z ]", "", line)
words = line.split()
for item in words:
if item in word_count:
word_count[item] = word_count[item] + 1
else:
word_count[item] = 1
item_to_remove = []
for item in word_count:
if word_count[item] <= freq:
item_to_remove.append(item)
for item in item_to_remove:
del word_count[item]
with open(dict_path, 'w+') as f:
for k, v in word_count.items():
f.write(str(k) + " " + str(v) + '\n')
if __name__ == "__main__":
args = parse_args()
preprocess(args.data_path, args.dict_path, args.freq)
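A typical run of this script and a quick look at its output might be as follows; the word counts shown are purely illustrative, not real enwik9 statistics:

```bash
python preprocess.py --data_path data/enwik9 --dict_path data/enwik9_dict --freq 5
head -n 3 data/enwik9_dict
# each line is a "word count" pair, for example:
# anarchism 914
# originated 572
# abuse 381
```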
import argparse
import logging
import numpy as np
# disable gpu training for this example
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""
import paddle
import paddle.fluid as fluid
import reader
from network_conf import ctr_dnn_model
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
def parse_args():
parser = argparse.ArgumentParser(description="PaddlePaddle DeepFM example")
parser.add_argument(
'--model_path',
type=str,
required=True,
help="The path of model parameters gz file")
parser.add_argument(
'--data_path',
type=str,
required=True,
help="The path of the dataset to infer")
parser.add_argument(
'--embedding_size',
type=int,
default=10,
help="The size for embedding layer (default:10)")
parser.add_argument(
'--sparse_feature_dim',
type=int,
default=1000001,
help="The size for embedding layer (default:1000001)")
parser.add_argument(
'--batch_size',
type=int,
default=1000,
help="The size of mini-batch (default:1000)")
return parser.parse_args()
def infer():
args = parse_args()
place = fluid.CPUPlace()
inference_scope = fluid.core.Scope()
dataset = reader.CriteoDataset(args.sparse_feature_dim)
test_reader = paddle.batch(
dataset.test([args.data_path]), batch_size=args.batch_size)
startup_program = fluid.framework.Program()
test_program = fluid.framework.Program()
with fluid.framework.program_guard(test_program, startup_program):
loss, data_list, auc_var, batch_auc_var = ctr_dnn_model(
args.embedding_size, args.sparse_feature_dim)
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=data_list, place=place)
with fluid.scope_guard(inference_scope):
[inference_program, _, fetch_targets] = fluid.io.load_inference_model(
args.model_path, exe)
        def set_zero(var_name):
            # reset the AUC state tensors to zero so the AUC is accumulated
            # only over the inference data
            param = inference_scope.var(var_name).get_tensor()
            param_array = np.zeros(param._get_dims()).astype("int64")
            param.set(param_array, place)
auc_states_names = ['_generated_var_2', '_generated_var_3']
for name in auc_states_names:
set_zero(name)
for batch_id, data in enumerate(test_reader()):
loss_val, auc_val = exe.run(inference_program,
feed=feeder.feed(data),
fetch_list=fetch_targets)
if batch_id % 100 == 0:
logger.info("TEST --> batch: {} loss: {} auc: {}".format(
batch_id, loss_val / args.batch_size, auc_val))
if __name__ == '__main__':
infer()
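A usage sketch for this script (the model path is a placeholder; the flags are the ones defined in parse_args above):

```bash
python infer.py \
    --model_path ./models/pass-0 \
    --data_path ./data/raw/valid.txt \
    --batch_size 1000
```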
""" # -*- coding: utf-8 -*
Preprocess Criteo dataset. This dataset was used for the Display Advertising
Challenge (https://www.kaggle.com/c/criteo-display-ad-challenge). import re
""" import argparse
import os
import sys
import click def parse_args():
import random parser = argparse.ArgumentParser(
import collections description="Paddle Fluid word2 vector preprocess")
parser.add_argument(
# There are 13 integer features and 26 categorical features '--data_path',
continous_features = range(1, 14) type=str,
categorial_features = range(14, 40) required=True,
help="The path of training dataset")
# Clip integer features. The clip point for each integer feature parser.add_argument(
# is derived from the 95% quantile of the total values in each feature '--dict_path',
continous_clip = [20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50] type=str,
default='./dict',
help="The path of generated dict")
class CategoryDictGenerator: parser.add_argument(
'--freq',
type=int,
default=5,
help="If the word count is less then freq, it will be removed from dict")
return parser.parse_args()
def preprocess(data_path, dict_path, freq):
""" """
Generate dictionary for each of the categorical features proprocess the data, generate dictionary and save into dict_path.
:param data_path: the input data path.
:param dict_path: the generated dict path. the data in dict is "word count"
:param freq:
:return:
""" """
# word to count
word_count = dict()
def __init__(self, num_feature): with open(data_path) as f:
self.dicts = []
self.num_feature = num_feature
for i in range(0, num_feature):
self.dicts.append(collections.defaultdict(int))
def build(self, datafile, categorial_features, cutoff=0):
with open(datafile, 'r') as f:
for line in f: for line in f:
features = line.rstrip('\n').split('\t') line = line.lower()
for i in range(0, self.num_feature): line = re.sub("[^a-z ]", "", line)
if features[categorial_features[i]] != '': words = line.split()
self.dicts[i][features[categorial_features[i]]] += 1 for item in words:
for i in range(0, self.num_feature): if item in word_count:
self.dicts[i] = filter(lambda x: x[1] >= cutoff, word_count[item] = word_count[item] + 1
self.dicts[i].items())
self.dicts[i] = sorted(self.dicts[i], key=lambda x: (-x[1], x[0]))
vocabs, _ = list(zip(*self.dicts[i]))
self.dicts[i] = dict(zip(vocabs, range(1, len(vocabs) + 1)))
self.dicts[i]['<unk>'] = 0
def gen(self, idx, key):
if key not in self.dicts[idx]:
res = self.dicts[idx]['<unk>']
else: else:
res = self.dicts[idx][key] word_count[item] = 1
return res item_to_remove = []
for item in word_count:
def dicts_sizes(self): if word_count[item] <= freq:
return map(len, self.dicts) item_to_remove.append(item)
for item in item_to_remove:
del word_count[item]
class ContinuousFeatureGenerator:
"""
Normalize the integer features to [0, 1] by min-max normalization
"""
def __init__(self, num_feature):
self.num_feature = num_feature
self.min = [sys.maxint] * num_feature
self.max = [-sys.maxint] * num_feature
def build(self, datafile, continous_features):
with open(datafile, 'r') as f:
for line in f:
features = line.rstrip('\n').split('\t')
for i in range(0, self.num_feature):
val = features[continous_features[i]]
if val != '':
val = int(val)
if val > continous_clip[i]:
val = continous_clip[i]
self.min[i] = min(self.min[i], val)
self.max[i] = max(self.max[i], val)
def gen(self, idx, val):
if val == '':
return 0.0
val = float(val)
return (val - self.min[idx]) / (self.max[idx] - self.min[idx])
@click.command("preprocess")
@click.option("--datadir", type=str, help="Path to raw criteo dataset")
@click.option("--outdir", type=str, help="Path to save the processed data")
def preprocess(datadir, outdir):
"""
All 13 integer features are normalized to continuous values and these continuous
features are combined into one vector with dimension of 13.
Each of the 26 categorical features are one-hot encoded and all the one-hot
vectors are combined into one sparse binary vector.
"""
dists = ContinuousFeatureGenerator(len(continous_features))
dists.build(os.path.join(datadir, 'train.txt'), continous_features)
dicts = CategoryDictGenerator(len(categorial_features))
dicts.build(
os.path.join(datadir, 'train.txt'), categorial_features, cutoff=200)
dict_sizes = dicts.dicts_sizes()
categorial_feature_offset = [0]
for i in range(1, len(categorial_features)):
offset = categorial_feature_offset[i - 1] + dict_sizes[i - 1]
categorial_feature_offset.append(offset)
random.seed(0)
# 90% of the data are used for training, and 10% of the data are used
# for validation.
with open(os.path.join(outdir, 'train.txt'), 'w') as out_train:
with open(os.path.join(outdir, 'valid.txt'), 'w') as out_valid:
with open(os.path.join(datadir, 'train.txt'), 'r') as f:
for line in f:
features = line.rstrip('\n').split('\t')
continous_vals = []
for i in range(0, len(continous_features)):
val = dists.gen(i, features[continous_features[i]])
continous_vals.append("{0:.6f}".format(val).rstrip('0')
.rstrip('.'))
categorial_vals = []
for i in range(0, len(categorial_features)):
val = dicts.gen(i, features[categorial_features[
i]]) + categorial_feature_offset[i]
categorial_vals.append(str(val))
continous_vals = ','.join(continous_vals)
categorial_vals = ','.join(categorial_vals)
label = features[0]
if random.randint(0, 9999) % 10 != 0:
out_train.write('\t'.join(
[continous_vals, categorial_vals, label]) + '\n')
else:
out_valid.write('\t'.join(
[continous_vals, categorial_vals, label]) + '\n')
with open(os.path.join(outdir, 'test.txt'), 'w') as out:
with open(os.path.join(datadir, 'test.txt'), 'r') as f:
for line in f:
features = line.rstrip('\n').split('\t')
continous_vals = []
for i in range(0, len(continous_features)):
val = dists.gen(i, features[continous_features[i] - 1])
continous_vals.append("{0:.6f}".format(val).rstrip('0')
.rstrip('.'))
categorial_vals = []
for i in range(0, len(categorial_features)):
val = dicts.gen(i, features[categorial_features[
i] - 1]) + categorial_feature_offset[i]
categorial_vals.append(str(val))
continous_vals = ','.join(continous_vals) with open(dict_path, 'w+') as f:
categorial_vals = ','.join(categorial_vals) for k, v in word_count.items():
out.write('\t'.join([continous_vals, categorial_vals]) + '\n') f.write(str(k) + " " + str(v) + '\n')
if __name__ == "__main__": if __name__ == "__main__":
preprocess() args = parse_args()
preprocess(args.data_path, args.dict_path, args.freq)
Old version of the reader (removed), built around the text8 dataset:

# -*- coding: utf-8 -*
import time
import numpy as np
import random
from collections import Counter
"""
refs: https://github.com/NELSONZHAO/zhihu/blob/master/skip_gram/Skip-Gram-English-Corpus.ipynb
text8 dataset
http://mattmahoney.net/dc/textdata.html
"""

with open('data/text8.txt') as f:
    text = f.read()


def preprocess(text, freq=5):
    '''
    Preprocess the text.

    Args
    ---
    text: the text data
    freq: the word-frequency threshold
    '''
    # replace punctuation in the text with tokens
    text = text.lower()
    text = text.replace('.', ' <PERIOD> ')
    text = text.replace(',', ' <COMMA> ')
    text = text.replace('"', ' <QUOTATION_MARK> ')
    text = text.replace(';', ' <SEMICOLON> ')
    text = text.replace('!', ' <EXCLAMATION_MARK> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    text = text.replace('(', ' <LEFT_PAREN> ')
    text = text.replace(')', ' <RIGHT_PAREN> ')
    text = text.replace('--', ' <HYPHENS> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    # text = text.replace('\n', ' <NEW_LINE> ')
    text = text.replace(':', ' <COLON> ')
    words = text.split()

    # drop low-frequency words to reduce noise
    word_counts = Counter(words)
    trimmed_words = [word for word in words if word_counts[word] > freq]
    return trimmed_words


# clean the text and tokenize it
words = preprocess(text)
print(words[:20])

# build the vocabulary mapping
vocab = set(words)
vocab_to_int = {w: c for c, w in enumerate(vocab)}
dict_size = len(set(words))
print("total words: {}".format(len(words)))
print("unique words: {}".format(dict_size))

# convert the original text from words to vocabulary ids
int_words = [vocab_to_int[w] for w in words]

t = 1e-5  # the subsampling constant t
threshold = 0.8  # the drop-probability threshold

# # count word occurrences
# int_word_counts = Counter(int_words)
# total_count = len(int_words)
# # compute word frequencies
# word_freqs = {w: c/total_count for w, c in int_word_counts.items()}
# # compute the probability of dropping each word
# prob_drop = {w: 1 - np.sqrt(t / word_freqs[w]) for w in int_word_counts}
# # subsample the words
# train_words = [w for w in int_words if prob_drop[w] < threshold]

train_words = int_words
len(train_words)


def get_targets(words, idx, window_size=5):
    '''
    Get the list of context words for an input word.

    Args
    ---
    words: the word list
    idx: the index of the input word
    window_size: the window size
    '''
    target_window = np.random.randint(1, window_size + 1)
    # handle the case where there are not enough words before the input word
    start_point = idx - target_window if (idx - target_window) > 0 else 0
    end_point = idx + target_window
    # output words (i.e. the context words inside the window)
    targets = set(words[start_point:idx] + words[idx + 1:end_point + 1])
    return list(targets)


def get_batches(words, batch_size, window_size=5):
    def _reader():
        '''
        Build a generator that yields batches.
        '''
        n_batches = len(words) // batch_size

        # keep only full batches
        new_words = words[:n_batches * batch_size]

        for idx in range(0, len(new_words), batch_size):
            x, y = [], []
            batch = new_words[idx:idx + batch_size]
            for i in range(len(batch)):
                batch_x = batch[i]
                batch_y = get_targets(batch, i, window_size)
                # one input word maps to several output words, so the input is
                # repeated to keep the two lists the same length
                x.extend([batch_x] * len(batch_y))
                y.extend(batch_y)
                for i in range(len(batch_y)):
                    yield [x[i]], [y[i]]

    return _reader


if __name__ == "__main__":
    epochs = 10  # number of epochs
    batch_size = 1000  # batch size
    window_size = 10  # window size

    batches = get_batches(train_words, batch_size, window_size)
    i = 0
    for x, y in batches():
        print("x: " + str(x))
        print("y: " + str(y))
        print("\n")

New version (added), a Word2VecReader that streams (target, context) word-id pairs from enwik9:

# -*- coding: utf-8 -*
import numpy as np
"""
enwik9 dataset
http://mattmahoney.net/dc/enwik9.zip
"""


class Word2VecReader(object):
    def __init__(self, dict_path, data_path, window_size=5):
        self.window_size_ = window_size
        self.data_path_ = data_path
        self.word_to_id_ = dict()

        word_id = 0
        with open(dict_path, 'r') as f:
            for line in f:
                self.word_to_id_[line.split()[0]] = word_id
                word_id += 1
        self.dict_size = len(self.word_to_id_)
        print("dict_size = " + str(self.dict_size))

    def get_context_words(self, words, idx, window_size):
        """
        Get the context word list of the target word.

        words: the words of the current line
        idx: input word index
        window_size: window size
        """
        target_window = np.random.randint(1, window_size + 1)
        # keep in mind that there may not be enough words before the target word.
        start_point = idx - target_window if (idx - target_window) > 0 else 0
        end_point = idx + target_window
        # context words of the target word
        targets = set(words[start_point:idx] + words[idx + 1:end_point + 1])
        return list(targets)

    def train(self):
        def _reader():
            with open(self.data_path_, 'r') as f:
                for line in f:
                    word_ids = [
                        self.word_to_id_[word] for word in line.split()
                        if word in self.word_to_id_
                    ]
                    for idx, target_id in enumerate(word_ids):
                        context_word_ids = self.get_context_words(
                            word_ids, idx, self.window_size_)
                        for context_id in context_word_ids:
                            yield [target_id], [context_id]

        return _reader


if __name__ == "__main__":
    epochs = 10
    batch_size = 1000
    window_size = 10

    reader = Word2VecReader("data/enwik9_dict", "data/enwik9", window_size)
    i = 0
    for x, y in reader.train()():
        print("x: " + str(x))
        print("y: " + str(y))
        print("\n")
Changes to the training script:

@@ -23,12 +23,17 @@ def parse_args():
     parser.add_argument(
         '--train_data_path',
         type=str,
-        default='./data/raw/train.txt',
+        default='./data/enwik9',
         help="The path of training dataset")
     parser.add_argument(
+        '--dict_path',
+        type=str,
+        default='./data/enwik9_dict',
+        help="The path of data dict")
+    parser.add_argument(
         '--test_data_path',
         type=str,
-        default='./data/raw/valid.txt',
+        default='./data/text8',
         help="The path of testing dataset")
     parser.add_argument(
         '--batch_size',
@@ -86,11 +91,11 @@ def parse_args():
     return parser.parse_args()


-def train_loop(args, train_program, data_list, loss, trainer_num, trainer_id):
-    dataset = reader.get_batches(reader.train_words, 5, 5)
+def train_loop(args, train_program, reader, data_list, loss, trainer_num,
+               trainer_id):
     train_reader = paddle.batch(
         paddle.reader.shuffle(
-            dataset, buf_size=args.batch_size * 100),
+            reader.train(), buf_size=args.batch_size * 100),
         batch_size=args.batch_size)

     place = fluid.CPUPlace()
@@ -124,14 +129,17 @@ def train():
     if not os.path.isdir(args.model_output_dir):
         os.mkdir(args.model_output_dir)

-    loss, data_list = skip_gram_word2vec(reader.dict_size, args.embedding_size)
+    word2vec_reader = reader.Word2VecReader(args.dict_path,
+                                            args.train_data_path)
+    loss, data_list = skip_gram_word2vec(word2vec_reader.dict_size,
+                                         args.embedding_size)
     optimizer = fluid.optimizer.Adam(learning_rate=1e-3)
     optimizer.minimize(loss)

     if args.is_local:
         logger.info("run local training")
         main_program = fluid.default_main_program()
-        train_loop(args, main_program, data_list, loss, 1, -1)
+        train_loop(args, main_program, word2vec_reader, data_list, loss, 1, -1)
     else:
         logger.info("run dist training")
         t = fluid.DistributeTranspiler()
@@ -148,8 +156,8 @@ def train():
     elif args.role == "trainer":
         logger.info("run trainer")
         train_prog = t.get_trainer_program()
-        train_loop(args, train_prog, data_list, loss, args.trainers,
-                   args.trainer_id + 1)
+        train_loop(args, train_prog, word2vec_reader, data_list, loss,
+                   args.trainers, args.trainer_id + 1)


 if __name__ == '__main__':
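For context, the distributed branch elided above typically builds the pserver and trainer programs with the Fluid DistributeTranspiler roughly as follows. This is a hedged sketch of the usual Fluid 1.x pattern, not the exact code of this file; `args.endpoints` and `args.current_endpoint` are assumed argument names that do not appear in this diff.

```python
# Sketch only: args.endpoints and args.current_endpoint are assumed names.
t = fluid.DistributeTranspiler()
t.transpile(args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
if args.role == "pserver":
    # build and run the parameter-server program for this endpoint
    pserver_prog = t.get_pserver_program(args.current_endpoint)
    pserver_startup = t.get_startup_program(args.current_endpoint, pserver_prog)
    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(pserver_startup)
    exe.run(pserver_prog)
elif args.role == "trainer":
    # the trainer side, as in the hunk above
    train_prog = t.get_trainer_program()
    train_loop(args, train_prog, word2vec_reader, data_list, loss,
               args.trainers, args.trainer_id + 1)
```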