From 6ca686a2fab670a97ec1008ec3e46057a1b917cc Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 11 Nov 2018 20:46:28 +0800 Subject: [PATCH] change dataset from text8 to enwik9 --- fluid/PaddleRec/word2vec/README.md | 19 +- fluid/PaddleRec/word2vec/data/download.sh | 5 + fluid/PaddleRec/word2vec/data/preprocess.py | 64 ------ fluid/PaddleRec/word2vec/infer.py | 93 --------- fluid/PaddleRec/word2vec/preprocess.py | 218 ++++++-------------- fluid/PaddleRec/word2vec/reader.py | 174 +++++----------- fluid/PaddleRec/word2vec/train.py | 26 ++- fluid/__init__.py | 0 8 files changed, 137 insertions(+), 462 deletions(-) create mode 100644 fluid/PaddleRec/word2vec/data/download.sh delete mode 100644 fluid/PaddleRec/word2vec/data/preprocess.py mode change 100755 => 100644 fluid/PaddleRec/word2vec/preprocess.py create mode 100644 fluid/__init__.py diff --git a/fluid/PaddleRec/word2vec/README.md b/fluid/PaddleRec/word2vec/README.md index 8f648534..1da28c43 100644 --- a/fluid/PaddleRec/word2vec/README.md +++ b/fluid/PaddleRec/word2vec/README.md @@ -2,22 +2,7 @@ # DNN for Click-Through Rate prediction ## Introduction -This model implements the DNN part proposed in the following paper: - -```text -@inproceedings{guo2017deepfm, - title={DeepFM: A Factorization-Machine based Neural Network for CTR Prediction}, - author={Huifeng Guo, Ruiming Tang, Yunming Ye, Zhenguo Li and Xiuqiang He}, - booktitle={the Twenty-Sixth International Joint Conference on Artificial Intelligence (IJCAI)}, - pages={1725--1731}, - year={2017} -} -``` -The DeepFm combines factorization machine and deep neural networks to model -both low order and high order feature interactions. For details of the -factorization machines, please refer to the paper [factorization -machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf) ## Environment You should install PaddlePaddle Fluid first. @@ -49,6 +34,10 @@ training dataset are splited such that 90% are used for training and the other 10% are used for validation during training. In reader.py, training data is the first 90% of data in train.txt, and validation data is the left. +```bash +python preprocess.py --data_path data/enwik9 --dict_path data/enwik9_dict +``` + ## Train The command line options for training can be listed by `python train.py -h`. diff --git a/fluid/PaddleRec/word2vec/data/download.sh b/fluid/PaddleRec/word2vec/data/download.sh new file mode 100644 index 00000000..0ae2baff --- /dev/null +++ b/fluid/PaddleRec/word2vec/data/download.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +wget http://mattmahoney.net/dc/enwik9.zip +unzip enwik9.zip + diff --git a/fluid/PaddleRec/word2vec/data/preprocess.py b/fluid/PaddleRec/word2vec/data/preprocess.py deleted file mode 100644 index 961a35e1..00000000 --- a/fluid/PaddleRec/word2vec/data/preprocess.py +++ /dev/null @@ -1,64 +0,0 @@ -# -*- coding: utf-8 -* - -import re -import argparse - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Paddle Fluid word2 vector preprocess") - parser.add_argument( - '--data_path', - type=str, - required=True, - help="The path of training dataset") - parser.add_argument( - '--dict_path', - type=str, - default='./dict', - help="The path of generated dict") - parser.add_argument( - '--freq', - type=int, - default=5, - help="If the word count is less then freq, it will be removed from dict") - - return parser.parse_args() - - -def preprocess(data_path, dict_path, freq): - """ - proprocess the data, generate dictionary and save into dict_path. - :param data_path: the input data path. 
- :param dict_path: the generated dict path. the data in dict is "word count" - :param freq: - :return: - """ - # word to count - word_count = dict() - - with open(data_path) as f: - for line in f: - line = line.lower() - line = re.sub("[^0-9a-z ]", "", line) - words = line.split() - for item in words: - if item in word_count: - word_count[item] = word_count[item] + 1 - else: - word_count[item] = 1 - item_to_remove = [] - for item in word_count: - if word_count[item] <= freq: - item_to_remove.append(item) - for item in item_to_remove: - del word_count[item] - - with open(dict_path, 'w+') as f: - for k, v in word_count.items(): - f.write(str(k) + " " + str(v) + '\n') - - -if __name__ == "__main__": - args = parse_args() - preprocess(args.data_path, args.dict_path, args.freq) diff --git a/fluid/PaddleRec/word2vec/infer.py b/fluid/PaddleRec/word2vec/infer.py index d353b770..e69de29b 100644 --- a/fluid/PaddleRec/word2vec/infer.py +++ b/fluid/PaddleRec/word2vec/infer.py @@ -1,93 +0,0 @@ -import argparse -import logging - -import numpy as np -# disable gpu training for this example -import os -os.environ["CUDA_VISIBLE_DEVICES"] = "" -import paddle -import paddle.fluid as fluid - -import reader -from network_conf import ctr_dnn_model - -logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger("fluid") -logger.setLevel(logging.INFO) - - -def parse_args(): - parser = argparse.ArgumentParser(description="PaddlePaddle DeepFM example") - parser.add_argument( - '--model_path', - type=str, - required=True, - help="The path of model parameters gz file") - parser.add_argument( - '--data_path', - type=str, - required=True, - help="The path of the dataset to infer") - parser.add_argument( - '--embedding_size', - type=int, - default=10, - help="The size for embedding layer (default:10)") - parser.add_argument( - '--sparse_feature_dim', - type=int, - default=1000001, - help="The size for embedding layer (default:1000001)") - parser.add_argument( - '--batch_size', - type=int, - default=1000, - help="The size of mini-batch (default:1000)") - - return parser.parse_args() - - -def infer(): - args = parse_args() - - place = fluid.CPUPlace() - inference_scope = fluid.core.Scope() - - dataset = reader.CriteoDataset(args.sparse_feature_dim) - test_reader = paddle.batch( - dataset.test([args.data_path]), batch_size=args.batch_size) - - startup_program = fluid.framework.Program() - test_program = fluid.framework.Program() - with fluid.framework.program_guard(test_program, startup_program): - loss, data_list, auc_var, batch_auc_var = ctr_dnn_model( - args.embedding_size, args.sparse_feature_dim) - - exe = fluid.Executor(place) - - feeder = fluid.DataFeeder(feed_list=data_list, place=place) - - with fluid.scope_guard(inference_scope): - [inference_program, _, fetch_targets] = fluid.io.load_inference_model( - args.model_path, exe) - - def set_zero(var_name): - param = inference_scope.var(var_name).get_tensor() - param_array = np.zeros(param._get_dims()).astype("int64") - param.set(param_array, place) - - auc_states_names = ['_generated_var_2', '_generated_var_3'] - for name in auc_states_names: - set_zero(name) - - for batch_id, data in enumerate(test_reader()): - loss_val, auc_val = exe.run(inference_program, - feed=feeder.feed(data), - fetch_list=fetch_targets) - if batch_id % 100 == 0: - logger.info("TEST --> batch: {} loss: {} auc: {}".format( - batch_id, loss_val / args.batch_size, auc_val)) - - -if __name__ == '__main__': - infer() diff --git 
a/fluid/PaddleRec/word2vec/preprocess.py b/fluid/PaddleRec/word2vec/preprocess.py old mode 100755 new mode 100644 index bf5673ba..0d1f58c0 --- a/fluid/PaddleRec/word2vec/preprocess.py +++ b/fluid/PaddleRec/word2vec/preprocess.py @@ -1,164 +1,64 @@ -""" -Preprocess Criteo dataset. This dataset was used for the Display Advertising -Challenge (https://www.kaggle.com/c/criteo-display-ad-challenge). -""" -import os -import sys -import click -import random -import collections - -# There are 13 integer features and 26 categorical features -continous_features = range(1, 14) -categorial_features = range(14, 40) - -# Clip integer features. The clip point for each integer feature -# is derived from the 95% quantile of the total values in each feature -continous_clip = [20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50] - - -class CategoryDictGenerator: +# -*- coding: utf-8 -*- + +import re +import argparse + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Paddle Fluid word2vec preprocess") + parser.add_argument( + '--data_path', + type=str, + required=True, + help="The path of training dataset") + parser.add_argument( + '--dict_path', + type=str, + default='./dict', + help="The path of generated dict") + parser.add_argument( + '--freq', + type=int, + default=5, + help="If the word count is no greater than freq, the word will be removed from the dict") + + return parser.parse_args() + + +def preprocess(data_path, dict_path, freq): """ - Generate dictionary for each of the categorical features + Preprocess the data, generate the dictionary and save it into dict_path. + :param data_path: the input data path. + :param dict_path: the generated dict path. Each line of the dict is "word count". + :param freq: words whose count is no greater than freq are removed from the dict + :return: """ - - def __init__(self, num_feature): - self.dicts = [] - self.num_feature = num_feature - for i in range(0, num_feature): - self.dicts.append(collections.defaultdict(int)) - - def build(self, datafile, categorial_features, cutoff=0): - with open(datafile, 'r') as f: - for line in f: - features = line.rstrip('\n').split('\t') - for i in range(0, self.num_feature): - if features[categorial_features[i]] != '': - self.dicts[i][features[categorial_features[i]]] += 1 - for i in range(0, self.num_feature): - self.dicts[i] = filter(lambda x: x[1] >= cutoff, - self.dicts[i].items()) - self.dicts[i] = sorted(self.dicts[i], key=lambda x: (-x[1], x[0])) - vocabs, _ = list(zip(*self.dicts[i])) - self.dicts[i] = dict(zip(vocabs, range(1, len(vocabs) + 1))) - self.dicts[i][''] = 0 - - def gen(self, idx, key): - if key not in self.dicts[idx]: - res = self.dicts[idx][''] - else: - res = self.dicts[idx][key] - return res - - def dicts_sizes(self): - return map(len, self.dicts) - - -class ContinuousFeatureGenerator: - """ - Normalize the integer features to [0, 1] by min-max normalization - """ - - def __init__(self, num_feature): - self.num_feature = num_feature - self.min = [sys.maxint] * num_feature - self.max = [-sys.maxint] * num_feature - - def build(self, datafile, continous_features): - with open(datafile, 'r') as f: - for line in f: - features = line.rstrip('\n').split('\t') - for i in range(0, self.num_feature): - val = features[continous_features[i]] - if val != '': - val = int(val) - if val > continous_clip[i]: - val = continous_clip[i] - self.min[i] = min(self.min[i], val) - self.max[i] = max(self.max[i], val) - - def gen(self, idx, val): - if val == '': - return 0.0 - val = float(val) - return (val - self.min[idx]) / (self.max[idx] - self.min[idx]) - - -@click.command("preprocess")
-@click.option("--datadir", type=str, help="Path to raw criteo dataset") -@click.option("--outdir", type=str, help="Path to save the processed data") -def preprocess(datadir, outdir): - """ - All 13 integer features are normalized to continuous values and these continuous - features are combined into one vector with dimension of 13. - - Each of the 26 categorical features are one-hot encoded and all the one-hot - vectors are combined into one sparse binary vector. - """ - dists = ContinuousFeatureGenerator(len(continous_features)) - dists.build(os.path.join(datadir, 'train.txt'), continous_features) - - dicts = CategoryDictGenerator(len(categorial_features)) - dicts.build( - os.path.join(datadir, 'train.txt'), categorial_features, cutoff=200) - - dict_sizes = dicts.dicts_sizes() - categorial_feature_offset = [0] - for i in range(1, len(categorial_features)): - offset = categorial_feature_offset[i - 1] + dict_sizes[i - 1] - categorial_feature_offset.append(offset) - - random.seed(0) - - # 90% of the data are used for training, and 10% of the data are used - # for validation. - with open(os.path.join(outdir, 'train.txt'), 'w') as out_train: - with open(os.path.join(outdir, 'valid.txt'), 'w') as out_valid: - with open(os.path.join(datadir, 'train.txt'), 'r') as f: - for line in f: - features = line.rstrip('\n').split('\t') - - continous_vals = [] - for i in range(0, len(continous_features)): - val = dists.gen(i, features[continous_features[i]]) - continous_vals.append("{0:.6f}".format(val).rstrip('0') - .rstrip('.')) - categorial_vals = [] - for i in range(0, len(categorial_features)): - val = dicts.gen(i, features[categorial_features[ - i]]) + categorial_feature_offset[i] - categorial_vals.append(str(val)) - - continous_vals = ','.join(continous_vals) - categorial_vals = ','.join(categorial_vals) - label = features[0] - if random.randint(0, 9999) % 10 != 0: - out_train.write('\t'.join( - [continous_vals, categorial_vals, label]) + '\n') - else: - out_valid.write('\t'.join( - [continous_vals, categorial_vals, label]) + '\n') - - with open(os.path.join(outdir, 'test.txt'), 'w') as out: - with open(os.path.join(datadir, 'test.txt'), 'r') as f: - for line in f: - features = line.rstrip('\n').split('\t') - - continous_vals = [] - for i in range(0, len(continous_features)): - val = dists.gen(i, features[continous_features[i] - 1]) - continous_vals.append("{0:.6f}".format(val).rstrip('0') - .rstrip('.')) - categorial_vals = [] - for i in range(0, len(categorial_features)): - val = dicts.gen(i, features[categorial_features[ - i] - 1]) + categorial_feature_offset[i] - categorial_vals.append(str(val)) - - continous_vals = ','.join(continous_vals) - categorial_vals = ','.join(categorial_vals) - out.write('\t'.join([continous_vals, categorial_vals]) + '\n') + # word to count + word_count = dict() + + with open(data_path) as f: + for line in f: + line = line.lower() + line = re.sub("[^a-z ]", "", line) + words = line.split() + for item in words: + if item in word_count: + word_count[item] = word_count[item] + 1 + else: + word_count[item] = 1 + item_to_remove = [] + for item in word_count: + if word_count[item] <= freq: + item_to_remove.append(item) + for item in item_to_remove: + del word_count[item] + + with open(dict_path, 'w+') as f: + for k, v in word_count.items(): + f.write(str(k) + " " + str(v) + '\n') if __name__ == "__main__": - preprocess() + args = parse_args() + preprocess(args.data_path, args.dict_path, args.freq) diff --git a/fluid/PaddleRec/word2vec/reader.py 
b/fluid/PaddleRec/word2vec/reader.py index 7852b74e..ca00dae5 100644 --- a/fluid/PaddleRec/word2vec/reader.py +++ b/fluid/PaddleRec/word2vec/reader.py @@ -1,138 +1,68 @@ # -*- coding: utf-8 -* -import time import numpy as np -import random -from collections import Counter """ -refs: https://github.com/NELSONZHAO/zhihu/blob/master/skip_gram/Skip-Gram-English-Corpus.ipynb +enwik9 dataset -text8 dataset - -http://mattmahoney.net/dc/textdata.html +http://mattmahoney.net/dc/enwik9.zip """ -with open('data/text8.txt') as f: - text = f.read() - - -def preprocess(text, freq=5): - ''' - 对文本进行预处理 - - 参数 - --- - text: 文本数据 - freq: 词频阈值 - ''' - # 对文本中的符号进行替换 - text = text.lower() - text = text.replace('.', ' ') - text = text.replace(',', ' ') - text = text.replace('"', ' ') - text = text.replace(';', ' ') - text = text.replace('!', ' ') - text = text.replace('?', ' ') - text = text.replace('(', ' ') - text = text.replace(')', ' ') - text = text.replace('--', ' ') - text = text.replace('?', ' ') - # text = text.replace('\n', ' ') - text = text.replace(':', ' ') - words = text.split() - - # 删除低频词,减少噪音影响 - word_counts = Counter(words) - trimmed_words = [word for word in words if word_counts[word] > freq] - - return trimmed_words - - -# 清洗文本并分词 -words = preprocess(text) -print(words[:20]) - -# 构建映射表 -vocab = set(words) -vocab_to_int = {w: c for c, w in enumerate(vocab)} - -dict_size = len(set(words)) - -print("total words: {}".format(len(words))) -print("unique words: {}".format(dict_size)) - -# 对原文本进行vocab到int的转换 -int_words = [vocab_to_int[w] for w in words] - -t = 1e-5 # t值 -threshold = 0.8 # 剔除概率阈值 - -# # 统计单词出现频次 -# int_word_counts = Counter(int_words) -# total_count = len(int_words) -# # 计算单词频率 -# word_freqs = {w: c/total_count for w, c in int_word_counts.items()} -# # 计算被删除的概率 -# prob_drop = {w: 1 - np.sqrt(t / word_freqs[w]) for w in int_word_counts} -# # 对单词进行采样 -# train_words = [w for w in int_words if prob_drop[w] < threshold] - -train_words = int_words -len(train_words) - - -def get_targets(words, idx, window_size=5): - ''' - 获得input word的上下文单词列表 - - 参数 - --- - words: 单词列表 - idx: input word的索引号 - window_size: 窗口大小 - ''' - target_window = np.random.randint(1, window_size + 1) - # 这里要考虑input word前面单词不够的情况 - start_point = idx - target_window if (idx - target_window) > 0 else 0 - end_point = idx + target_window - # output words(即窗口中的上下文单词) - targets = set(words[start_point:idx] + words[idx + 1:end_point + 1]) - return list(targets) - - -def get_batches(words, batch_size, window_size=5): - def _reader(): - ''' - 构造一个获取batch的生成器 - ''' - n_batches = len(words) // batch_size - - # 仅取full batches - new_words = words[:n_batches * batch_size] - - for idx in range(0, len(new_words), batch_size): - x, y = [], [] - batch = new_words[idx:idx + batch_size] - for i in range(len(batch)): - batch_x = batch[i] - batch_y = get_targets(batch, i, window_size) - # 由于一个input word会对应多个output word,因此需要长度统一 - x.extend([batch_x] * len(batch_y)) - y.extend(batch_y) - for i in range(len(batch_y)): - yield [x[i]], [y[i]] - return _reader +class Word2VecReader(object): + def __init__(self, dict_path, data_path, window_size=5): + self.window_size_ = window_size + self.data_path_ = data_path + self.word_to_id_ = dict() + + word_id = 0 + with open(dict_path, 'r') as f: + for line in f: + self.word_to_id_[line.split()[0]] = word_id + word_id += 1 + self.dict_size = len(self.word_to_id_) + print("dict_size = " + str(self.dict_size)) + + def get_context_words(self, words, idx, window_size): + """ + Get the context word list of 
the target word. + + words: the words of the current line + idx: input word index + window_size: window size + """ + target_window = np.random.randint(1, window_size + 1) + # keep in mind that there may not be enough words before the target word. + start_point = idx - target_window if (idx - target_window) > 0 else 0 + end_point = idx + target_window + # context words of the target word + targets = set(words[start_point:idx] + words[idx + 1:end_point + 1]) + return list(targets) + + def train(self): + def _reader(): + with open(self.data_path_, 'r') as f: + for line in f: + word_ids = [ + self.word_to_id_[word] for word in line.split() + if word in self.word_to_id_ + ] + for idx, target_id in enumerate(word_ids): + context_word_ids = self.get_context_words( + word_ids, idx, self.window_size_) + for context_id in context_word_ids: + yield [target_id], [context_id] + + return _reader if __name__ == "__main__": - epochs = 10 # 迭代轮数 - batch_size = 1000 # batch大小 - window_size = 10 # 窗口大小 + epochs = 10 + batch_size = 1000 + window_size = 10 - batches = get_batches(train_words, batch_size, window_size) + reader = Word2VecReader("data/enwik9_dict", "data/enwik9", window_size) i = 0 - for x, y in batches(): + for x, y in reader.train()(): print("x: " + str(x)) print("y: " + str(y)) print("\n") diff --git a/fluid/PaddleRec/word2vec/train.py b/fluid/PaddleRec/word2vec/train.py index 2932cf54..d0bc2393 100644 --- a/fluid/PaddleRec/word2vec/train.py +++ b/fluid/PaddleRec/word2vec/train.py @@ -23,12 +23,17 @@ def parse_args(): parser.add_argument( '--train_data_path', type=str, - default='./data/raw/train.txt', + default='./data/enwik9', help="The path of training dataset") + parser.add_argument( + '--dict_path', + type=str, + default='./data/enwik9_dict', + help="The path of the data dict") parser.add_argument( '--test_data_path', type=str, - default='./data/raw/valid.txt', + default='./data/text8', help="The path of testing dataset") parser.add_argument( '--batch_size', @@ -86,11 +91,11 @@ def parse_args(): return parser.parse_args() -def train_loop(args, train_program, data_list, loss, trainer_num, trainer_id): - dataset = reader.get_batches(reader.train_words, 5, 5) +def train_loop(args, train_program, reader, data_list, loss, trainer_num, + trainer_id): train_reader = paddle.batch( paddle.reader.shuffle( - dataset, buf_size=args.batch_size * 100), + reader.train(), buf_size=args.batch_size * 100), batch_size=args.batch_size) place = fluid.CPUPlace() @@ -124,14 +129,17 @@ def train(): if not os.path.isdir(args.model_output_dir): os.mkdir(args.model_output_dir) - loss, data_list = skip_gram_word2vec(reader.dict_size, args.embedding_size) + word2vec_reader = reader.Word2VecReader(args.dict_path, + args.train_data_path) + loss, data_list = skip_gram_word2vec(word2vec_reader.dict_size, + args.embedding_size) optimizer = fluid.optimizer.Adam(learning_rate=1e-3) optimizer.minimize(loss) if args.is_local: logger.info("run local training") main_program = fluid.default_main_program() - train_loop(args, main_program, data_list, loss, 1, -1) + train_loop(args, main_program, word2vec_reader, data_list, loss, 1, -1) else: logger.info("run dist training") t = fluid.DistributeTranspiler() @@ -148,8 +156,8 @@ def train(): elif args.role == "trainer": logger.info("run trainer") train_prog = t.get_trainer_program() - train_loop(args, train_prog, data_list, loss, args.trainers, - args.trainer_id + 1) + train_loop(args, train_prog, word2vec_reader, data_list, loss, + args.trainers, args.trainer_id + 1) if __name__ 
== '__main__': diff --git a/fluid/__init__.py b/fluid/__init__.py new file mode 100644 index 00000000..e69de29b -- GitLab
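For reviewers trying this patch out, here is a minimal usage sketch of the new enwik9 pipeline, assuming the commands are run from fluid/PaddleRec/word2vec/ and that unzipping enwik9.zip leaves a plain-text enwik9 file under data/ (download.sh does not change directories, so it is invoked from inside data/ here):

```bash
# Download and unpack enwik9 (download.sh just runs wget + unzip).
cd data && sh download.sh && cd ..

# Build the word-count dict from the raw corpus; words with a count
# no greater than --freq (default 5) are dropped.
python preprocess.py --data_path data/enwik9 --dict_path data/enwik9_dict

# Train the skip-gram model; both paths match the new train.py defaults,
# so they could also be omitted.
python train.py --train_data_path data/enwik9 --dict_path data/enwik9_dict
```

reader.py can also be run on its own (python reader.py) to spot-check the generated (target, context) pairs, as its __main__ block in this patch does.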