From c16dd3df7a8ebbadab8263d4f7d726a90a323a5a Mon Sep 17 00:00:00 2001
From: Superjom <superjom@gmail.com>
Date: Wed, 31 May 2017 14:07:35 +0800
Subject: [PATCH] add more usage in dataset

---
 ctr/dataset.md | 81 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 77 insertions(+), 4 deletions(-)

diff --git a/ctr/dataset.md b/ctr/dataset.md
index 62511be2..86426d48 100644
--- a/ctr/dataset.md
+++ b/ctr/dataset.md
@@ -50,7 +50,7 @@
 类别类特征的提取方法有以下两种：
 
 1.  One-hot 表示作为特征
-2.  类似词向量，用一个 Embedding Table 将每个类别映射到对应的向量
+2.  类似词向量，用一个 Embedding 将每个类别映射到对应的向量
 
 
 ### ID 类特征
@@ -119,6 +119,42 @@ class CategoryFeatureGenerator(object):
         return '<CategoryFeatureGenerator %d>' % len(self.dic)
 ```
 
+`CategoryFeatureGenerator` 需要先扫描数据集，得到该类别对应的项集合，之后才能开始生成特征。
+
+我们的实验数据集[\[3\]](https://www.kaggle.com/c/avazu-ctr-prediction/data)已经经过 shuffle，可以扫描前面一定数目的记录来近似总的类别项集合（等价于随机抽样），
+对于没有被抽样到的低频类别项，可以用一个 UNK 的特殊值表示。
+
+```python
+fields = {}
+for key in categorial_features:
+    fields[key] = CategoryFeatureGenerator()
+
+def detect_dataset(path, topn, id_fea_space=10000):
+    '''
+    Register the categorical values of the first `topn` records of the CSV
+    file at `path` into the module-level `fields` generators.
+    NOTE the records should be randomly shuffled first.
+    '''
+    # `id_fea_space` is accepted but not used in this snippet.
+    # `row_id` is 0-based, so `>=` stops after exactly `topn` records.
+    with open(path, 'rb') as csvfile:
+        reader = csv.DictReader(csvfile)
+        for row_id, row in enumerate(reader):
+            if row_id >= topn:
+                break
+
+            for key in categorial_features:
+                fields[key].register(row[key])
+```
+
+`CategoryFeatureGenerator` 在注册得到数据集中对应类别信息后，可以对相应记录生成对应的特征表示：
+
+```python
+record = []
+for key in categorial_features:
+    record.append(fields[key].gen(row[key]))
+```
+
 本任务中，类别类特征会输入到 DNN 中使用。
 
 ### ID 类特征
@@ -145,6 +181,15 @@ class IDfeatureGenerator(object):
         return self.max_dim
 ```
 
+`IDfeatureGenerator` 不需要预先初始化，可以直接生成特征，比如：
+
+```python
+record = []
+for key in id_features:
+    if 'cross' not in key:
+        record.append(fields[key].gen(row[key]))
+```
+
 ### 交叉类特征
 
 LR 模型作为 Wide & Deep model 的 `wide` 部分，可以输入很 wide 的数据（特征空间的维度很大），
@@ -161,10 +206,17 @@ def gen_cross_fea(self, fea1, fea2):
 比如，我们觉得原始数据中， `device_id` 和 `site_id` 有一些关联（比如某个 device 倾向于浏览特定 site)，
 我们通过组合出两者组合来捕捉这类信息。
 
+```python
+fea0 = fields[key].cross_fea0
+fea1 = fields[key].cross_fea1
+record.append(
+    fields[key].gen_cross_fea(row[fea0], row[fea1]))
+```
+
 ### 特征维度
 #### Deep submodel(DNN)特征
 | feature          | dimention |
-|------------------+-----------|
+|------------------|-----------|
 | app_category     |        21 |
 | site_category    |        22 |
 | device_conn_type |         5 |
@@ -174,7 +226,7 @@ def gen_cross_fea(self, fea1, fea2):
 
 #### Wide submodel(LR)特征
 | Feature             | Dimention |
-|---------------------+-----------|
+|---------------------|-----------|
 | id                  |     10000 |
 | site_id             |     10000 |
 | app_id              |     10000 |
@@ -184,7 +236,7 @@ def gen_cross_fea(self, fea1, fea2):
 
 ## 输入到 PaddlePaddle 中
 
-Deep 和 Wide 两部分均以 `sparse_binary_vector` 的格式[1]输入，输入前需要将相关特征拼合，模型最终只接受 3 个 input，
+Deep 和 Wide 两部分均以 `sparse_binary_vector` 的格式 [\[1\]](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/api/v1/data_provider/pydataprovider2_en.rst) 输入，输入前需要将相关特征拼合，模型最终只接受 3 个 input，
 分别是
 
 1.  `dnn input` ，DNN 的输入
@@ -213,4 +265,25 @@ def concat_sparse_vectors(inputs, dims):
     return res
 ```
 
+生成最终特征的代码如下：
+
+```python
+# dimensions of the features, one entry per feature key
+categorial_dims = [
+    feature_dims[key] for key in categorial_features + ['hour']
+]
+id_dims = [feature_dims[key] for key in id_features]
+
+dense_input = concat_sparse_vectors(record, categorial_dims)
+sparse_input = concat_sparse_vectors(record, id_dims)
+
+record = [dense_input, sparse_input]
+record.append(list((int(row['click']), )))
+yield record
+```
+
+## 参考文献
+
 1. <https://github.com/PaddlePaddle/Paddle/blob/develop/doc/api/v1/data_provider/pydataprovider2_en.rst>
+2. Mikolov T, Deoras A, Povey D, et al. [Strategies for training large scale neural network language models](https://www.researchgate.net/profile/Lukas_Burget/publication/241637478_Strategies_for_training_large_scale_neural_network_language_models/links/542c14960cf27e39fa922ed3.pdf)[C]//Automatic Speech Recognition and Understanding (ASRU), 2011 IEEE Workshop on. IEEE, 2011: 196-201.
+3. <https://www.kaggle.com/c/avazu-ctr-prediction/data>
-- 
GitLab