diff --git a/deep_fm/README.md b/deep_fm/README.md
index 588ee091db8f71e5b0018f8b61f81b3dc9f4e9b9..232868cae863767f062893046ed97e34faae97e5 100644
--- a/deep_fm/README.md
+++ b/deep_fm/README.md
@@ -1,5 +1,85 @@
-# DeepFM 基于深度因子分解机的点击率预测模型
+# Deep Factorization Machines (DeepFM) for Click-Through Rate Prediction
-## 简介
+## Introduction
+This model implements the DeepFM model proposed in the following paper:
-[TBD]
+```text
+    Huifeng Guo, Ruiming Tang, Yunming Ye, Zhenguo Li and Xiuqiang He. DeepFM:
+    A Factorization-Machine based Neural Network for CTR Prediction.
+    Proceedings of the Twenty-Sixth International Joint Conference on
+    Artificial Intelligence (IJCAI-17), 2017
+```
+
+DeepFM combines factorization machines and deep neural networks to model
+both low-order and high-order feature interactions. For details of
+factorization machines, please refer to the paper [Factorization
+Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf).
+
+## Dataset
+This example uses the Criteo dataset, which was used for the [Display Advertising
+Challenge](https://www.kaggle.com/c/criteo-display-ad-challenge/)
+hosted by Kaggle.
+
+Each row contains the features for one ad impression, and the first column is a
+label indicating whether the ad was clicked. There are 39 features in
+total: 13 take integer values and the other 26 are categorical.
+For the test dataset, the labels are omitted.
+
+Download the dataset:
+```bash
+cd data && ./download.sh && cd ..
+```
+
+## Model
+The DeepFM model is composed of a factorization machine (FM) layer and a deep
+neural network (DNN). All input features are fed to both the FM and the DNN.
+The outputs of the FM and the DNN are combined to form the final prediction.
+The embedding layer for the sparse features in the DNN shares its parameters
+with the latent vectors (factors) of the FM layer.
+
+The factorization machine layer in PaddlePaddle computes only the second-order
+interactions. The following code example combines the factorization machine
+layer with a fully connected layer for the first-order term to form the full
+factorization machine:
+
+```python
+def fm_layer(input, factor_size):
+    first_order = paddle.layer.fc(input=input, size=1, act=paddle.activation.Linear())
+    second_order = paddle.layer.factorization_machine(input=input, factor_size=factor_size)
+    fm = paddle.layer.addto(input=[first_order, second_order],
+                            act=paddle.activation.Linear(),
+                            bias_attr=False)
+    return fm
+```
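+
+For reference, the second-order term computed by the FM layer is the sum of all
+pairwise feature interactions weighted by the inner products of the latent
+factors, which Rendle's identity evaluates in O(n * factor_size) time instead
+of O(n^2). The NumPy sketch below is for illustration only and is not part of
+this repo; `fm_second_order`, `x`, and `v` are names invented for this example:
+
+```python
+import numpy as np
+
+def fm_second_order(x, v):
+    # x: dense input features, shape (n,)
+    # v: latent factor matrix, shape (n, factor_size)
+    # Computes sum_{i<j} <v_i, v_j> * x_i * x_j without forming all pairs.
+    linear = x.dot(v)                  # per-factor sums, shape (factor_size,)
+    squares = (x ** 2).dot(v ** 2)     # per-factor sums of squares
+    return 0.5 * float(np.sum(linear ** 2 - squares))
+```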
+
+## Data preparation
+To preprocess the raw dataset, the integer features are clipped and then
+min-max normalized to [0, 1], and the categorical features are one-hot encoded.
+The raw training dataset is split so that 90% is used for training and the
+other 10% is used for validation during training.
+
+```bash
+python preprocess.py --datadir ./data/raw --outdir ./data
+```
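+
+For reference, the transformation applied to each integer feature is equivalent
+to the simplified sketch below. `clip_and_normalize` is a name invented for
+this example; the actual implementation is the two-pass
+`ContinuousFeatureGenerator` in `preprocess.py`, which applies the clip while
+collecting the per-feature minima and maxima:
+
+```python
+def clip_and_normalize(raw, clip, lo, hi):
+    # Empty fields map to 0.0; other values are clipped at a per-feature
+    # threshold, then min-max scaled into [0, 1].
+    if raw == '':
+        return 0.0
+    val = min(float(raw), clip)
+    return (val - lo) / (hi - lo)
+```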
+
+## Train
+The command-line options for training can be listed by `python train.py -h`.
+
+To train the model:
+```bash
+python train.py \
+    --train_data_path data/train.txt \
+    --test_data_path data/valid.txt \
+    2>&1 | tee train.log
+```
+
+## Infer
+The command-line options for inference can be listed by `python infer.py -h`.
+
+To run inference on the test dataset:
+```bash
+python infer.py \
+    --model_gz_path models/model-pass-9-batch-10000.tar.gz \
+    --data_path data/test.txt \
+    --prediction_output_path ./predict.txt
+```
diff --git a/deep_fm/data/download.sh b/deep_fm/data/download.sh
index 1cadfe5a3ef5d266c20d0af3d99398f8a6057d16..466a22f2c6cc885cea0a1468f3043cb59c611b59 100755
--- a/deep_fm/data/download.sh
+++ b/deep_fm/data/download.sh
@@ -1,5 +1,8 @@
 #!/bin/bash
 
-wget https://s3-eu-west-1.amazonaws.com/criteo-labs/dac.tar.gz
+wget --no-check-certificate https://s3-eu-west-1.amazonaws.com/criteo-labs/dac.tar.gz
 tar zxf dac.tar.gz
 rm -f dac.tar.gz
+
+mkdir raw
+mv ./*.txt raw/
diff --git a/deep_fm/images/DeepFM.png b/deep_fm/images/DeepFM.png
deleted file mode 100644
index 31444dbf4db65846209380a3e1eebe49cd1e6a73..0000000000000000000000000000000000000000
Binary files a/deep_fm/images/DeepFM.png and /dev/null differ
diff --git a/deep_fm/images/FM.png b/deep_fm/images/FM.png
deleted file mode 100644
index 469d636a07c41de68e4dc06513ccb4a5c1a898a3..0000000000000000000000000000000000000000
Binary files a/deep_fm/images/FM.png and /dev/null differ
diff --git a/deep_fm/network_conf.py b/deep_fm/network_conf.py
index 2382c0cdf13dbe12401a3279f95616523e0dd379..1857c8f60f959e9fd35ce9523af94470d4326bf5 100644
--- a/deep_fm/network_conf.py
+++ b/deep_fm/network_conf.py
@@ -14,7 +14,7 @@ def fm_layer(input, factor_size, fm_param_attr):
         param_attr=fm_param_attr)
     out = paddle.layer.addto(
         input=[first_order, second_order],
-        act=paddle.activation.Sigmoid(),
+        act=paddle.activation.Linear(),
         bias_attr=False)
     return out
 
@@ -68,6 +68,9 @@ def DeepFM(factor_size, infer=False):
             name="label", type=paddle.data_type.dense_vector(1))
         cost = paddle.layer.multi_binary_label_cross_entropy_cost(
             input=predict, label=label)
+        paddle.evaluator.classification_error(
+            name="classification_error", input=predict, label=label)
+        paddle.evaluator.auc(name="auc", input=predict, label=label)
         return cost
     else:
         return predict
diff --git a/deep_fm/preprocess.py b/deep_fm/preprocess.py
index 1995b1f4898f11961297ad8b97e72187445f6386..4e6f8a6a63ff7e4d603ab059b23b661444570163 100755
--- a/deep_fm/preprocess.py
+++ b/deep_fm/preprocess.py
@@ -5,12 +5,17 @@ Challenge (https://www.kaggle.com/c/criteo-display-ad-challenge).
 import os
 import sys
 import click
+import random
 import collections
 
 # There are 13 integer features and 26 categorical features
 continous_features = range(1, 14)
 categorial_features = range(14, 40)
 
+# Clip integer features. The clip point for each integer feature
+# is derived from the 95% quantile of the total values in each feature
+continous_clip = [20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
+
 
 class CategoryDictGenerator:
     """
@@ -67,12 +72,14 @@ class ContinuousFeatureGenerator:
                 val = features[continous_features[i]]
                 if val != '':
                     val = int(val)
+                    if val > continous_clip[i]:
+                        val = continous_clip[i]
                     self.min[i] = min(self.min[i], val)
                     self.max[i] = max(self.max[i], val)
 
     def gen(self, idx, val):
         if val == '':
-            return 0
+            return 0.0
         val = float(val)
         return (val - self.min[idx]) / (self.max[idx] - self.min[idx])
@@ -101,26 +108,36 @@ def preprocess(datadir, outdir):
         offset = categorial_feature_offset[i - 1] + dict_sizes[i - 1]
         categorial_feature_offset.append(offset)
 
-    with open(os.path.join(outdir, 'train.txt'), 'w') as out:
-        with open(os.path.join(datadir, 'train.txt'), 'r') as f:
-            for line in f:
-                features = line.rstrip('\n').split('\t')
-
-                continous_vals = []
-                for i in range(0, len(continous_features)):
-                    val = dists.gen(i, features[continous_features[i]])
-                    continous_vals.append(str(val))
-                categorial_vals = []
-                for i in range(0, len(categorial_features)):
-                    val = dicts.gen(i, features[categorial_features[
-                        i]]) + categorial_feature_offset[i]
-                    categorial_vals.append(str(val))
-
-                continous_vals = ','.join(continous_vals)
-                categorial_vals = ','.join(categorial_vals)
-                label = features[0]
-                out.write('\t'.join([continous_vals, categorial_vals, label]) +
-                          '\n')
+    random.seed(0)
+
+    # 90% of the data are used for training, and 10% of the data are used
+    # for validation.
+    with open(os.path.join(outdir, 'train.txt'), 'w') as out_train:
+        with open(os.path.join(outdir, 'valid.txt'), 'w') as out_valid:
+            with open(os.path.join(datadir, 'train.txt'), 'r') as f:
+                for line in f:
+                    features = line.rstrip('\n').split('\t')
+
+                    continous_vals = []
+                    for i in range(0, len(continous_features)):
+                        val = dists.gen(i, features[continous_features[i]])
+                        continous_vals.append(
+                            "{0:.6f}".format(val).rstrip('0').rstrip('.'))
+                    categorial_vals = []
+                    for i in range(0, len(categorial_features)):
+                        val = dicts.gen(i, features[categorial_features[
+                            i]]) + categorial_feature_offset[i]
+                        categorial_vals.append(str(val))
+
+                    continous_vals = ','.join(continous_vals)
+                    categorial_vals = ','.join(categorial_vals)
+                    label = features[0]
+                    if random.randint(0, 9999) % 10 != 0:
+                        out_train.write('\t'.join(
+                            [continous_vals, categorial_vals, label]) + '\n')
+                    else:
+                        out_valid.write('\t'.join(
+                            [continous_vals, categorial_vals, label]) + '\n')
 
     with open(os.path.join(outdir, 'test.txt'), 'w') as out:
         with open(os.path.join(datadir, 'test.txt'), 'r') as f:
@@ -130,7 +147,8 @@
                 continous_vals = []
                 for i in range(0, len(continous_features)):
                     val = dists.gen(i, features[continous_features[i] - 1])
-                    continous_vals.append(str(val))
+                    continous_vals.append(
+                        "{0:.6f}".format(val).rstrip('0').rstrip('.'))
                 categorial_vals = []
                 for i in range(0, len(categorial_features)):
                     val = dicts.gen(i,
diff --git a/deep_fm/reader.py b/deep_fm/reader.py
index 2ac30ecc533f99f6d49a5bf02cdc6b15bc3fe0f9..1098ce423c9071864671be91dea81972e47fbc98 100644
--- a/deep_fm/reader.py
+++ b/deep_fm/reader.py
@@ -18,6 +18,9 @@ class Dataset:
     def train(self, path):
         return self._reader_creator(path, False)
 
+    def test(self, path):
+        return self._reader_creator(path, False)
+
     def infer(self, path):
         return self._reader_creator(path, True)
diff --git a/deep_fm/train.py b/deep_fm/train.py
index 2be7e7d990616cfffd4bbfce93e3687cba58c1c1..6f7889995f989d67ee2e4517e447d66d0c5e7a92 100755
--- a/deep_fm/train.py
+++ b/deep_fm/train.py
@@ -20,11 +20,16 @@ def parse_args():
         type=str,
         required=True,
         help="path of training dataset")
+    parser.add_argument(
+        '--test_data_path',
+        type=str,
+        required=True,
+        help="path of testing dataset")
     parser.add_argument(
         '--batch_size',
         type=int,
-        default=10000,
-        help="size of mini-batch (default:10000)")
+        default=1000,
+        help="size of mini-batch (default:1000)")
     parser.add_argument(
         '--num_passes',
         type=int,
@@ -52,7 +57,7 @@ def train():
 
     paddle.init(use_gpu=False, trainer_count=1)
 
-    optimizer = paddle.optimizer.Adam(learning_rate=1e-3)
+    optimizer = paddle.optimizer.Adam(learning_rate=1e-4)
 
     model = DeepFM(args.factor_size)
 
@@ -66,11 +71,22 @@ def train():
     def __event_handler__(event):
         if isinstance(event, paddle.event.EndIteration):
             num_samples = event.batch_id * args.batch_size
-            if event.batch_id % 10 == 0:
-                logger.warning("Pass %d, Batch %d, Samples %d, Cost %f" % (
-                    event.pass_id, event.batch_id, num_samples, event.cost))
+            if event.batch_id % 100 == 0:
+                logger.warning("Pass %d, Batch %d, Samples %d, Cost %f, %s" %
+                               (event.pass_id, event.batch_id, num_samples,
+                                event.cost, event.metrics))
+
+            if event.batch_id % 10000 == 0:
+                if args.test_data_path:
+                    result = trainer.test(
+                        reader=paddle.batch(
+                            dataset.test(args.test_data_path),
+                            batch_size=args.batch_size),
+                        feeding=reader.feeding)
+                    logger.warning("Test %d-%d, Cost %f, %s" %
+                                   (event.pass_id, event.batch_id, result.cost,
+                                    result.metrics))
 
-            if event.batch_id % 1000 == 0:
                 path = "{}/model-pass-{}-batch-{}.tar.gz".format(
                     args.model_output_dir, event.pass_id, event.batch_id)
                 with gzip.open(path, 'w') as f:
@@ -80,7 +96,7 @@ def train():
         reader=paddle.batch(
             paddle.reader.shuffle(
                 dataset.train(args.train_data_path),
-                buf_size=args.batch_size * 100),
+                buf_size=args.batch_size * 10000),
             batch_size=args.batch_size),
         feeding=reader.feeding,
         event_handler=__event_handler__,