PaddlePaddle / models
Commit 6ca686a2
Authored Nov 11, 2018 by Qiao Longfei

    change dataset from text8 to enwik9

Parent: dee19f6e

8 changed files, with 137 additions and 462 deletions (+137 -462)
fluid/PaddleRec/word2vec/README.md            +4    -15
fluid/PaddleRec/word2vec/data/download.sh     +5    -0
fluid/PaddleRec/word2vec/data/preprocess.py   +0    -64
fluid/PaddleRec/word2vec/infer.py             +0    -93
fluid/PaddleRec/word2vec/preprocess.py        +59   -159
fluid/PaddleRec/word2vec/reader.py            +52   -122
fluid/PaddleRec/word2vec/train.py             +17   -9
fluid/__init__.py                             +0    -0
fluid/PaddleRec/word2vec/README.md

@@ -2,22 +2,7 @@
  # DNN for Click-Through Rate prediction

  ## Introduction
- This model implements the DNN part proposed in the following paper:
- ```text
- @inproceedings{guo2017deepfm,
-   title={DeepFM: A Factorization-Machine based Neural Network for CTR Prediction},
-   author={Huifeng Guo, Ruiming Tang, Yunming Ye, Zhenguo Li and Xiuqiang He},
-   booktitle={the Twenty-Sixth International Joint Conference on Artificial Intelligence (IJCAI)},
-   pages={1725--1731},
-   year={2017}
- }
- ```
- The DeepFm combines factorization machine and deep neural networks to model
- both low order and high order feature interactions. For details of the
- factorization machines, please refer to the paper
- [factorization machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)

  ## Environment
  You should install PaddlePaddle Fluid first.
...
@@ -49,6 +34,10 @@ training dataset are splited such that 90% are used for training and the other
  10% are used for validation during training. In reader.py, training data is the first
  90% of data in train.txt, and validation data is the left.
+ ```bash
+ python preprocess.py --data_path data/enwik9 --dict_path data/enwik9_dict
+ ```

  ## Train
  The command line options for training can be listed by `python train.py -h`.
...
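The new README snippet builds a word dictionary from enwik9 with preprocess.py (shown later in this diff); that script writes one `word count` pair per line. A minimal sketch of reading such a dict file back, assuming the default path from the README example (illustrative only, not part of the commit):

```python
# Illustrative only: load the "word count" per-line dict file that
# preprocess.py writes; the path is the one used in the README example.
def load_dict(dict_path="data/enwik9_dict"):
    word_count = {}
    with open(dict_path) as f:
        for line in f:
            word, count = line.split()
            word_count[word] = int(count)
    return word_count
```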
fluid/PaddleRec/word2vec/data/download.sh (new file, mode 100644)

#!/bin/bash
wget http://mattmahoney.net/dc/enwik9.zip
unzip enwik9.zip
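download.sh fetches enwik9.zip and unzips it in the data directory. Since enwik9 is the first 10^9 bytes of an English Wikipedia XML dump, a quick size check can catch a truncated download; a hypothetical sketch (the path and the exact-size assumption are mine, not part of the commit):

```python
# Hypothetical sanity check, assuming the archive extracts to "enwik9"
# and that the file is exactly 10**9 bytes as the benchmark defines it.
import os

size = os.path.getsize("data/enwik9")
print("enwik9 size:", size, "bytes")
assert size == 10 ** 9, "unexpected size; the download may be incomplete"
```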
fluid/PaddleRec/word2vec/data/preprocess.py (deleted, mode 100644 → 0)

Removed content:

# -*- coding: utf-8 -*
import re
import argparse


def parse_args():
    parser = argparse.ArgumentParser(
        description="Paddle Fluid word2 vector preprocess")
    parser.add_argument(
        '--data_path',
        type=str,
        required=True,
        help="The path of training dataset")
    parser.add_argument(
        '--dict_path',
        type=str,
        default='./dict',
        help="The path of generated dict")
    parser.add_argument(
        '--freq',
        type=int,
        default=5,
        help="If the word count is less then freq, it will be removed from dict")
    return parser.parse_args()


def preprocess(data_path, dict_path, freq):
    """
    proprocess the data, generate dictionary and save into dict_path.
    :param data_path: the input data path.
    :param dict_path: the generated dict path. the data in dict is "word count"
    :param freq:
    :return:
    """
    # word to count
    word_count = dict()
    with open(data_path) as f:
        for line in f:
            line = line.lower()
            line = re.sub("[^0-9a-z ]", "", line)
            words = line.split()
            for item in words:
                if item in word_count:
                    word_count[item] = word_count[item] + 1
                else:
                    word_count[item] = 1
    item_to_remove = []
    for item in word_count:
        if word_count[item] <= freq:
            item_to_remove.append(item)
    for item in item_to_remove:
        del word_count[item]

    with open(dict_path, 'w+') as f:
        for k, v in word_count.items():
            f.write(str(k) + " " + str(v) + '\n')


if __name__ == "__main__":
    args = parse_args()
    preprocess(args.data_path, args.dict_path, args.freq)
fluid/PaddleRec/word2vec/infer.py (+0 -93)

Removed content (the Criteo/DeepFM inference script):

import argparse
import logging

import numpy as np
# disable gpu training for this example
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import paddle
import paddle.fluid as fluid

import reader
from network_conf import ctr_dnn_model

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)


def parse_args():
    parser = argparse.ArgumentParser(description="PaddlePaddle DeepFM example")
    parser.add_argument(
        '--model_path',
        type=str,
        required=True,
        help="The path of model parameters gz file")
    parser.add_argument(
        '--data_path',
        type=str,
        required=True,
        help="The path of the dataset to infer")
    parser.add_argument(
        '--embedding_size',
        type=int,
        default=10,
        help="The size for embedding layer (default:10)")
    parser.add_argument(
        '--sparse_feature_dim',
        type=int,
        default=1000001,
        help="The size for embedding layer (default:1000001)")
    parser.add_argument(
        '--batch_size',
        type=int,
        default=1000,
        help="The size of mini-batch (default:1000)")

    return parser.parse_args()


def infer():
    args = parse_args()

    place = fluid.CPUPlace()
    inference_scope = fluid.core.Scope()

    dataset = reader.CriteoDataset(args.sparse_feature_dim)
    test_reader = paddle.batch(
        dataset.test([args.data_path]), batch_size=args.batch_size)

    startup_program = fluid.framework.Program()
    test_program = fluid.framework.Program()
    with fluid.framework.program_guard(test_program, startup_program):
        loss, data_list, auc_var, batch_auc_var = ctr_dnn_model(
            args.embedding_size, args.sparse_feature_dim)

        exe = fluid.Executor(place)

        feeder = fluid.DataFeeder(feed_list=data_list, place=place)

        with fluid.scope_guard(inference_scope):
            [inference_program, _, fetch_targets] = fluid.io.load_inference_model(
                args.model_path, exe)

            def set_zero(var_name):
                param = inference_scope.var(var_name).get_tensor()
                param_array = np.zeros(param._get_dims()).astype("int64")
                param.set(param_array, place)

            auc_states_names = ['_generated_var_2', '_generated_var_3']
            for name in auc_states_names:
                set_zero(name)

            for batch_id, data in enumerate(test_reader()):
                loss_val, auc_val = exe.run(inference_program,
                                            feed=feeder.feed(data),
                                            fetch_list=fetch_targets)
                if batch_id % 100 == 0:
                    logger.info("TEST --> batch: {} loss: {} auc: {}".format(
                        batch_id, loss_val / args.batch_size, auc_val))


if __name__ == '__main__':
    infer()
fluid/PaddleRec/word2vec/preprocess.py (mode 100755 → 100644, +59 -159)

Removed (the previous contents, a Criteo CTR preprocessing script):

"""
Preprocess Criteo dataset. This dataset was used for the Display Advertising
Challenge (https://www.kaggle.com/c/criteo-display-ad-challenge).
"""
import os
import sys
import click
import random
import collections

# There are 13 integer features and 26 categorical features
continous_features = range(1, 14)
categorial_features = range(14, 40)

# Clip integer features. The clip point for each integer feature
# is derived from the 95% quantile of the total values in each feature
continous_clip = [20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]


class CategoryDictGenerator:
    """
    Generate dictionary for each of the categorical features
    """

    def __init__(self, num_feature):
        self.dicts = []
        self.num_feature = num_feature
        for i in range(0, num_feature):
            self.dicts.append(collections.defaultdict(int))

    def build(self, datafile, categorial_features, cutoff=0):
        with open(datafile, 'r') as f:
            for line in f:
                features = line.rstrip('\n').split('\t')
                for i in range(0, self.num_feature):
                    if features[categorial_features[i]] != '':
                        self.dicts[i][features[categorial_features[i]]] += 1
        for i in range(0, self.num_feature):
            self.dicts[i] = filter(lambda x: x[1] >= cutoff,
                                   self.dicts[i].items())
            self.dicts[i] = sorted(self.dicts[i], key=lambda x: (-x[1], x[0]))
            vocabs, _ = list(zip(*self.dicts[i]))
            self.dicts[i] = dict(zip(vocabs, range(1, len(vocabs) + 1)))
            self.dicts[i]['<unk>'] = 0

    def gen(self, idx, key):
        if key not in self.dicts[idx]:
            res = self.dicts[idx]['<unk>']
        else:
            res = self.dicts[idx][key]
        return res

    def dicts_sizes(self):
        return map(len, self.dicts)


class ContinuousFeatureGenerator:
    """
    Normalize the integer features to [0, 1] by min-max normalization
    """

    def __init__(self, num_feature):
        self.num_feature = num_feature
        self.min = [sys.maxint] * num_feature
        self.max = [-sys.maxint] * num_feature

    def build(self, datafile, continous_features):
        with open(datafile, 'r') as f:
            for line in f:
                features = line.rstrip('\n').split('\t')
                for i in range(0, self.num_feature):
                    val = features[continous_features[i]]
                    if val != '':
                        val = int(val)
                        if val > continous_clip[i]:
                            val = continous_clip[i]
                        self.min[i] = min(self.min[i], val)
                        self.max[i] = max(self.max[i], val)

    def gen(self, idx, val):
        if val == '':
            return 0.0
        val = float(val)
        return (val - self.min[idx]) / (self.max[idx] - self.min[idx])


@click.command("preprocess")
@click.option("--datadir", type=str, help="Path to raw criteo dataset")
@click.option("--outdir", type=str, help="Path to save the processed data")
def preprocess(datadir, outdir):
    """
    All 13 integer features are normalized to continuous values and these continuous
    features are combined into one vector with dimension of 13.
    Each of the 26 categorical features are one-hot encoded and all the one-hot
    vectors are combined into one sparse binary vector.
    """
    dists = ContinuousFeatureGenerator(len(continous_features))
    dists.build(os.path.join(datadir, 'train.txt'), continous_features)

    dicts = CategoryDictGenerator(len(categorial_features))
    dicts.build(os.path.join(datadir, 'train.txt'), categorial_features, cutoff=200)

    dict_sizes = dicts.dicts_sizes()
    categorial_feature_offset = [0]
    for i in range(1, len(categorial_features)):
        offset = categorial_feature_offset[i - 1] + dict_sizes[i - 1]
        categorial_feature_offset.append(offset)

    random.seed(0)
    # 90% of the data are used for training, and 10% of the data are used
    # for validation.
    with open(os.path.join(outdir, 'train.txt'), 'w') as out_train:
        with open(os.path.join(outdir, 'valid.txt'), 'w') as out_valid:
            with open(os.path.join(datadir, 'train.txt'), 'r') as f:
                for line in f:
                    features = line.rstrip('\n').split('\t')

                    continous_vals = []
                    for i in range(0, len(continous_features)):
                        val = dists.gen(i, features[continous_features[i]])
                        continous_vals.append(
                            "{0:.6f}".format(val).rstrip('0').rstrip('.'))
                    categorial_vals = []
                    for i in range(0, len(categorial_features)):
                        val = dicts.gen(i, features[categorial_features[i]]) + \
                              categorial_feature_offset[i]
                        categorial_vals.append(str(val))

                    continous_vals = ','.join(continous_vals)
                    categorial_vals = ','.join(categorial_vals)
                    label = features[0]
                    if random.randint(0, 9999) % 10 != 0:
                        out_train.write('\t'.join(
                            [continous_vals, categorial_vals, label]) + '\n')
                    else:
                        out_valid.write('\t'.join(
                            [continous_vals, categorial_vals, label]) + '\n')

    with open(os.path.join(outdir, 'test.txt'), 'w') as out:
        with open(os.path.join(datadir, 'test.txt'), 'r') as f:
            for line in f:
                features = line.rstrip('\n').split('\t')

                continous_vals = []
                for i in range(0, len(continous_features)):
                    val = dists.gen(i, features[continous_features[i] - 1])
                    continous_vals.append(
                        "{0:.6f}".format(val).rstrip('0').rstrip('.'))
                categorial_vals = []
                for i in range(0, len(categorial_features)):
                    val = dicts.gen(i, features[categorial_features[i] - 1]) + \
                          categorial_feature_offset[i]
                    categorial_vals.append(str(val))

                continous_vals = ','.join(continous_vals)
                categorial_vals = ','.join(categorial_vals)
                out.write('\t'.join([continous_vals, categorial_vals]) + '\n')


if __name__ == "__main__":
    preprocess()

Added (the new word2vec dictionary-building script):

# -*- coding: utf-8 -*
import re
import argparse


def parse_args():
    parser = argparse.ArgumentParser(
        description="Paddle Fluid word2 vector preprocess")
    parser.add_argument(
        '--data_path',
        type=str,
        required=True,
        help="The path of training dataset")
    parser.add_argument(
        '--dict_path',
        type=str,
        default='./dict',
        help="The path of generated dict")
    parser.add_argument(
        '--freq',
        type=int,
        default=5,
        help="If the word count is less then freq, it will be removed from dict")
    return parser.parse_args()


def preprocess(data_path, dict_path, freq):
    """
    proprocess the data, generate dictionary and save into dict_path.
    :param data_path: the input data path.
    :param dict_path: the generated dict path. the data in dict is "word count"
    :param freq:
    :return:
    """
    # word to count
    word_count = dict()
    with open(data_path) as f:
        for line in f:
            line = line.lower()
            line = re.sub("[^a-z ]", "", line)
            words = line.split()
            for item in words:
                if item in word_count:
                    word_count[item] = word_count[item] + 1
                else:
                    word_count[item] = 1
    item_to_remove = []
    for item in word_count:
        if word_count[item] <= freq:
            item_to_remove.append(item)
    for item in item_to_remove:
        del word_count[item]

    with open(dict_path, 'w+') as f:
        for k, v in word_count.items():
            f.write(str(k) + " " + str(v) + '\n')


if __name__ == "__main__":
    args = parse_args()
    preprocess(args.data_path, args.dict_path, args.freq)
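The added script builds the dictionary by lowercasing each line, stripping everything except letters and spaces, counting words, and dropping any word whose count is at or below --freq. A toy, self-contained illustration of that logic on an in-memory string (the sample text and threshold are made up):

```python
# Toy illustration of the dictionary-building logic above, applied to an
# in-memory string instead of the enwik9 file.
import re
from collections import Counter

sample = "Anarchism is a political philosophy. Anarchism is a philosophy!"
freq = 1  # words with count <= freq are dropped

words = re.sub("[^a-z ]", "", sample.lower()).split()
word_count = Counter(words)
kept = {w: c for w, c in word_count.items() if c > freq}
print(kept)  # {'anarchism': 2, 'is': 2, 'a': 2, 'philosophy': 2}
```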
fluid/PaddleRec/word2vec/reader.py (+52 -122)

Removed (the old text8, notebook-style skip-gram code; Chinese comments translated):

# -*- coding: utf-8 -*
import time
import numpy as np
import random
from collections import Counter

"""
refs: https://github.com/NELSONZHAO/zhihu/blob/master/skip_gram/Skip-Gram-English-Corpus.ipynb
text8 dataset
http://mattmahoney.net/dc/textdata.html
"""

with open('data/text8.txt') as f:
    text = f.read()


def preprocess(text, freq=5):
    '''
    Preprocess the text.

    Parameters
    ---
    text: text data
    freq: word-frequency threshold
    '''
    # replace punctuation in the text with tokens
    text = text.lower()
    text = text.replace('.', ' <PERIOD> ')
    text = text.replace(',', ' <COMMA> ')
    text = text.replace('"', ' <QUOTATION_MARK> ')
    text = text.replace(';', ' <SEMICOLON> ')
    text = text.replace('!', ' <EXCLAMATION_MARK> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    text = text.replace('(', ' <LEFT_PAREN> ')
    text = text.replace(')', ' <RIGHT_PAREN> ')
    text = text.replace('--', ' <HYPHENS> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    # text = text.replace('\n', ' <NEW_LINE> ')
    text = text.replace(':', ' <COLON> ')
    words = text.split()

    # drop low-frequency words to reduce noise
    word_counts = Counter(words)
    trimmed_words = [word for word in words if word_counts[word] > freq]

    return trimmed_words


# clean the text and tokenize it
words = preprocess(text)
print(words[:20])

# build the vocabulary mapping
vocab = set(words)
vocab_to_int = {w: c for c, w in enumerate(vocab)}
dict_size = len(set(words))
print("total words: {}".format(len(words)))
print("unique words: {}".format(dict_size))

# map the original text from vocab to int
int_words = [vocab_to_int[w] for w in words]

t = 1e-5  # t value
threshold = 0.8  # drop-probability threshold

# # count word occurrences
# int_word_counts = Counter(int_words)
# total_count = len(int_words)
# # compute word frequencies
# word_freqs = {w: c/total_count for w, c in int_word_counts.items()}
# # compute the probability of each word being dropped
# prob_drop = {w: 1 - np.sqrt(t / word_freqs[w]) for w in int_word_counts}
# # subsample the words
# train_words = [w for w in int_words if prob_drop[w] < threshold]
train_words = int_words

len(train_words)


def get_targets(words, idx, window_size=5):
    '''
    Get the list of context words for the input word.

    Parameters
    ---
    words: word list
    idx: index of the input word
    window_size: window size
    '''
    target_window = np.random.randint(1, window_size + 1)
    # handle the case where there are not enough words before the input word
    start_point = idx - target_window if (idx - target_window) > 0 else 0
    end_point = idx + target_window
    # output words (i.e. the context words inside the window)
    targets = set(words[start_point:idx] + words[idx + 1:end_point + 1])
    return list(targets)


def get_batches(words, batch_size, window_size=5):
    def _reader():
        '''
        Build a generator that yields batches.
        '''
        n_batches = len(words) // batch_size
        # keep full batches only
        new_words = words[:n_batches * batch_size]
        for idx in range(0, len(new_words), batch_size):
            x, y = [], []
            batch = new_words[idx:idx + batch_size]
            for i in range(len(batch)):
                batch_x = batch[i]
                batch_y = get_targets(batch, i, window_size)
                # one input word maps to several output words, so repeat it to match lengths
                x.extend([batch_x] * len(batch_y))
                y.extend(batch_y)
            for i in range(len(batch_y)):
                yield [x[i]], [y[i]]

    return _reader


if __name__ == "__main__":
    epochs = 10  # number of epochs
    batch_size = 1000  # batch size
    window_size = 10  # window size

    batches = get_batches(train_words, batch_size, window_size)
    i = 0
    for x, y in batches():
        print("x: " + str(x))
        print("y: " + str(y))
        print("\n")

Added (the new Word2VecReader for enwik9):

# -*- coding: utf-8 -*
import numpy as np

"""
enwik9 dataset
http://mattmahoney.net/dc/enwik9.zip
"""


class Word2VecReader(object):
    def __init__(self, dict_path, data_path, window_size=5):
        self.window_size_ = window_size
        self.data_path_ = data_path
        self.word_to_id_ = dict()

        word_id = 0
        with open(dict_path, 'r') as f:
            for line in f:
                self.word_to_id_[line.split()[0]] = word_id
                word_id += 1
        self.dict_size = len(self.word_to_id_)
        print("dict_size = " + str(self.dict_size))

    def get_context_words(self, words, idx, window_size):
        """
        Get the context word list of target word.

        words: the words of the current line
        idx: input word index
        window_size: window size
        """
        target_window = np.random.randint(1, window_size + 1)
        # need to keep in mind that maybe there are no enough words before the target word.
        start_point = idx - target_window if (idx - target_window) > 0 else 0
        end_point = idx + target_window
        # context words of the target word
        targets = set(words[start_point:idx] + words[idx + 1:end_point + 1])
        return list(targets)

    def train(self):
        def _reader():
            with open(self.data_path_, 'r') as f:
                for line in f:
                    word_ids = [
                        self.word_to_id_[word] for word in line.split()
                        if word in self.word_to_id_
                    ]
                    for idx, target_id in enumerate(word_ids):
                        context_word_ids = self.get_context_words(
                            word_ids, idx, self.window_size_)
                        for context_id in context_word_ids:
                            yield [target_id], [context_id]

        return _reader


if __name__ == "__main__":
    epochs = 10
    batch_size = 1000
    window_size = 10

    reader = Word2VecReader("data/enwik9_dict", "data/enwik9", window_size)
    i = 0
    for x, y in reader.train()():
        print("x: " + str(x))
        print("y: " + str(y))
        print("\n")
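Word2VecReader.train() yields one ([target_id], [context_id]) pair per context word, and the context window width is re-drawn uniformly from [1, window_size] for every target position, as in get_context_words above. A small standalone illustration of that sampling, with a made-up word-id list:

```python
# Standalone illustration of the per-target random context window used by
# get_context_words(); the word-id list below is made up.
import numpy as np

def context_ids(word_ids, idx, window_size=5):
    target_window = np.random.randint(1, window_size + 1)
    start = idx - target_window if (idx - target_window) > 0 else 0
    return list(set(word_ids[start:idx] + word_ids[idx + 1:idx + target_window + 1]))

print(context_ids([10, 11, 12, 13, 14, 15], idx=3))  # e.g. [11, 12, 14, 15]
```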
fluid/PaddleRec/word2vec/train.py (+17 -9)

@@ -23,12 +23,17 @@ def parse_args():
      parser.add_argument(
          '--train_data_path',
          type=str,
-         default='./data/raw/train.txt',
+         default='./data/enwik9',
          help="The path of training dataset")
+     parser.add_argument(
+         '--dict_path',
+         type=str,
+         default='./data/enwik9_dict',
+         help="The path of data dict")
      parser.add_argument(
          '--test_data_path',
          type=str,
-         default='./data/raw/valid.txt',
+         default='./data/text8',
          help="The path of testing dataset")
      parser.add_argument(
          '--batch_size',
...
@@ -86,11 +91,11 @@ def parse_args():
      return parser.parse_args()

- def train_loop(args, train_program, data_list, loss, trainer_num, trainer_id):
-     dataset = reader.get_batches(reader.train_words, 5, 5)
+ def train_loop(args, train_program, reader, data_list, loss, trainer_num,
+                trainer_id):
      train_reader = paddle.batch(
          paddle.reader.shuffle(
-             dataset, buf_size=args.batch_size * 100),
+             reader.train(), buf_size=args.batch_size * 100),
          batch_size=args.batch_size)

      place = fluid.CPUPlace()
...
@@ -124,14 +129,17 @@ def train():
      if not os.path.isdir(args.model_output_dir):
          os.mkdir(args.model_output_dir)

-     loss, data_list = skip_gram_word2vec(reader.dict_size, args.embedding_size)
+     word2vec_reader = reader.Word2VecReader(args.dict_path, args.train_data_path)
+
+     loss, data_list = skip_gram_word2vec(word2vec_reader.dict_size,
+                                          args.embedding_size)
      optimizer = fluid.optimizer.Adam(learning_rate=1e-3)
      optimizer.minimize(loss)

      if args.is_local:
          logger.info("run local training")
          main_program = fluid.default_main_program()
-         train_loop(args, main_program, data_list, loss, 1, -1)
+         train_loop(args, main_program, word2vec_reader, data_list, loss, 1, -1)
      else:
          logger.info("run dist training")
          t = fluid.DistributeTranspiler()
...
@@ -148,8 +156,8 @@ def train():
      elif args.role == "trainer":
          logger.info("run trainer")
          train_prog = t.get_trainer_program()
-         train_loop(args, train_prog, data_list, loss, args.trainers,
-                    args.trainer_id + 1)
+         train_loop(args, train_prog, word2vec_reader, data_list, loss,
+                    args.trainers, args.trainer_id + 1)

  if __name__ == '__main__':
...
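After this change, train_loop receives the reader object and shuffles and batches its generator directly. A minimal wiring sketch based on the hunks above; it assumes a Fluid-era PaddlePaddle where paddle.batch and paddle.reader.shuffle are available, plus the dict and corpus files produced by download.sh and preprocess.py, and is not guaranteed to run against current Paddle releases:

```python
# Minimal wiring sketch (assumptions: old Fluid-era paddle APIs, files at the
# default paths from the diff). Mirrors the train_loop change above.
import paddle
import reader

batch_size = 100
word2vec_reader = reader.Word2VecReader('./data/enwik9_dict', './data/enwik9')
train_reader = paddle.batch(
    paddle.reader.shuffle(
        word2vec_reader.train(), buf_size=batch_size * 100),
    batch_size=batch_size)

for batch in train_reader():
    # each element of the batch is a ([target_id], [context_id]) pair
    print("first batch size:", len(batch))
    break
```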
fluid/__init__.py (new empty file, mode 100644, +0 -0)