diff --git a/deep_fm/README.md b/deep_fm/README.md
index 588ee091db8f71e5b0018f8b61f81b3dc9f4e9b9..232868cae863767f062893046ed97e34faae97e5 100644
--- a/deep_fm/README.md
+++ b/deep_fm/README.md
@@ -1,5 +1,85 @@
-# DeepFM 基于深度因子分解机的点击率预测模型
+# Deep Factorization Machines (DeepFM) for Click-Through Rate Prediction
-## 简介
+## Introduction
+This model implements the DeepFM model proposed in the following paper:
-[TBD]
+```text
+    Huifeng Guo, Ruiming Tang, Yunming Ye, Zhenguo Li and Xiuqiang He. DeepFM:
+    A Factorization-Machine based Neural Network for CTR Prediction.
+    Proceedings of the Twenty-Sixth International Joint Conference on
+    Artificial Intelligence (IJCAI-17), 2017
+```
+
+DeepFM combines factorization machines and deep neural networks to model
+both low-order and high-order feature interactions. For details of
+factorization machines, please refer to the paper [Factorization
+Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf).
+
+## Dataset
+This example uses the Criteo dataset, which was used for the [Display Advertising
+Challenge](https://www.kaggle.com/c/criteo-display-ad-challenge/)
+hosted by Kaggle.
+
+Each row contains the features for one ad impression, and the first column is a
+label indicating whether the ad was clicked. There are 39 features in
+total: 13 take integer values and the other 26 are categorical.
+For the test dataset, the labels are omitted.
+
+Download the dataset:
+```bash
+cd data && ./download.sh && cd ..
+```
+
+## Model
+The DeepFM model is composed of a factorization machine (FM) layer and a deep
+neural network (DNN). All input features are fed to both the FM and the DNN.
+The outputs of the FM and the DNN are combined to form the final prediction.
+The embedding layer for the sparse features in the DNN shares its parameters
+with the latent vectors (factors) of the FM layer.
+
+The factorization machine layer in PaddlePaddle computes only the second-order
+interactions. The following code example combines the factorization machine
+layer with a fully connected layer for the first-order term to form the full
+factorization machine:
+
+```python
+def fm_layer(input, factor_size):
+    first_order = paddle.layer.fc(input=input, size=1, act=paddle.activation.Linear())
+    second_order = paddle.layer.factorization_machine(input=input, factor_size=factor_size)
+    fm = paddle.layer.addto(input=[first_order, second_order],
+                            act=paddle.activation.Linear(),
+                            bias_attr=False)
+    return fm
+```
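+
+For reference, the second-order term computed by the FM layer is the sum of all
+pairwise feature interactions weighted by the inner products of the latent
+factors, which Rendle's identity evaluates in O(n * factor_size) time instead
+of O(n^2). The NumPy sketch below is for illustration only and is not part of
+this repo; `fm_second_order`, `x`, and `v` are names invented for this example:
+
+```python
+import numpy as np
+
+def fm_second_order(x, v):
+    # x: dense input features, shape (n,)
+    # v: latent factor matrix, shape (n, factor_size)
+    # Computes sum_{i<j} <v_i, v_j> * x_i * x_j without forming all pairs.
+    linear = x.dot(v)                  # per-factor sums, shape (factor_size,)
+    squares = (x ** 2).dot(v ** 2)     # per-factor sums of squares
+    return 0.5 * float(np.sum(linear ** 2 - squares))
+```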
+
+## Data preparation
+To preprocess the raw dataset, the integer features are clipped and then
+min-max normalized to [0, 1], and the categorical features are one-hot encoded.
+The raw training dataset is split so that 90% is used for training and the
+other 10% is used for validation during training.
+
+```bash
+python preprocess.py --datadir ./data/raw --outdir ./data
+```
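+
+For reference, the transformation applied to each integer feature is equivalent
+to the simplified sketch below. `clip_and_normalize` is a name invented for
+this example; the actual implementation is the two-pass
+`ContinuousFeatureGenerator` in `preprocess.py`, which applies the clip while
+collecting the per-feature minima and maxima:
+
+```python
+def clip_and_normalize(raw, clip, lo, hi):
+    # Empty fields map to 0.0; other values are clipped at a per-feature
+    # threshold, then min-max scaled into [0, 1].
+    if raw == '':
+        return 0.0
+    val = min(float(raw), clip)
+    return (val - lo) / (hi - lo)
+```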
+
+## Train
+The command-line options for training can be listed by `python train.py -h`.
+
+To train the model:
+```bash
+python train.py \
+    --train_data_path data/train.txt \
+    --test_data_path data/valid.txt \
+    2>&1 | tee train.log
+```
+
+## Infer
+The command-line options for inference can be listed by `python infer.py -h`.
+
+To run inference on the test dataset:
+```bash
+python infer.py \
+    --model_gz_path models/model-pass-9-batch-10000.tar.gz \
+    --data_path data/test.txt \
+    --prediction_output_path ./predict.txt
+```
diff --git a/deep_fm/data/download.sh b/deep_fm/data/download.sh
index 1cadfe5a3ef5d266c20d0af3d99398f8a6057d16..466a22f2c6cc885cea0a1468f3043cb59c611b59 100755
--- a/deep_fm/data/download.sh
+++ b/deep_fm/data/download.sh
@@ -1,5 +1,8 @@
 #!/bin/bash
 
-wget https://s3-eu-west-1.amazonaws.com/criteo-labs/dac.tar.gz
+wget --no-check-certificate https://s3-eu-west-1.amazonaws.com/criteo-labs/dac.tar.gz
 tar zxf dac.tar.gz
 rm -f dac.tar.gz
+
+mkdir raw
+mv ./*.txt raw/
diff --git a/deep_fm/images/DeepFM.png b/deep_fm/images/DeepFM.png
deleted file mode 100644
index 31444dbf4db65846209380a3e1eebe49cd1e6a73..0000000000000000000000000000000000000000
Binary files a/deep_fm/images/DeepFM.png and /dev/null differ
diff --git a/deep_fm/images/FM.png b/deep_fm/images/FM.png
deleted file mode 100644
index 469d636a07c41de68e4dc06513ccb4a5c1a898a3..0000000000000000000000000000000000000000
Binary files a/deep_fm/images/FM.png and /dev/null differ
diff --git a/deep_fm/network_conf.py b/deep_fm/network_conf.py
index 2382c0cdf13dbe12401a3279f95616523e0dd379..1857c8f60f959e9fd35ce9523af94470d4326bf5 100644
--- a/deep_fm/network_conf.py
+++ b/deep_fm/network_conf.py
@@ -14,7 +14,7 @@ def fm_layer(input, factor_size, fm_param_attr):
         param_attr=fm_param_attr)
     out = paddle.layer.addto(
         input=[first_order, second_order],
-        act=paddle.activation.Sigmoid(),
+        act=paddle.activation.Linear(),
         bias_attr=False)
     return out
 
@@ -68,6 +68,9 @@ def DeepFM(factor_size, infer=False):
             name="label", type=paddle.data_type.dense_vector(1))
         cost = paddle.layer.multi_binary_label_cross_entropy_cost(
             input=predict, label=label)
+        paddle.evaluator.classification_error(
+            name="classification_error", input=predict, label=label)
+        paddle.evaluator.auc(name="auc", input=predict, label=label)
         return cost
     else:
         return predict
diff --git a/deep_fm/preprocess.py b/deep_fm/preprocess.py
index 1995b1f4898f11961297ad8b97e72187445f6386..4e6f8a6a63ff7e4d603ab059b23b661444570163 100755
--- a/deep_fm/preprocess.py
+++ b/deep_fm/preprocess.py
@@ -5,12 +5,17 @@ Challenge (https://www.kaggle.com/c/criteo-display-ad-challenge).
 import os
 import sys
 import click
+import random
 import collections
 
 # There are 13 integer features and 26 categorical features
 continous_features = range(1, 14)
 categorial_features = range(14, 40)
 
+# Clip integer features. The clip point for each integer feature
+# is derived from the 95% quantile of the total values in each feature
+continous_clip = [20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
+
 
 class CategoryDictGenerator:
     """
@@ -67,12 +72,14 @@ class ContinuousFeatureGenerator:
                 val = features[continous_features[i]]
                 if val != '':
                     val = int(val)
+                    if val > continous_clip[i]:
+                        val = continous_clip[i]
                     self.min[i] = min(self.min[i], val)
                     self.max[i] = max(self.max[i], val)
 
     def gen(self, idx, val):
         if val == '':
-            return 0
+            return 0.0
         val = float(val)
         return (val - self.min[idx]) / (self.max[idx] - self.min[idx])
@@ -101,26 +108,36 @@ def preprocess(datadir, outdir):
         offset = categorial_feature_offset[i - 1] + dict_sizes[i - 1]
         categorial_feature_offset.append(offset)
 
-    with open(os.path.join(outdir, 'train.txt'), 'w') as out:
-        with open(os.path.join(datadir, 'train.txt'), 'r') as f:
-            for line in f:
-                features = line.rstrip('\n').split('\t')
-
-                continous_vals = []
-                for i in range(0, len(continous_features)):
-                    val = dists.gen(i, features[continous_features[i]])
-                    continous_vals.append(str(val))
-                categorial_vals = []
-                for i in range(0, len(categorial_features)):
-                    val = dicts.gen(i, features[categorial_features[
-                        i]]) + categorial_feature_offset[i]
-                    categorial_vals.append(str(val))
-
-                continous_vals = ','.join(continous_vals)
-                categorial_vals = ','.join(categorial_vals)
-                label = features[0]
-                out.write('\t'.join([continous_vals, categorial_vals, label]) +
-                          '\n')
+    random.seed(0)
+
+    # 90% of the data are used for training, and 10% of the data are used
+    # for validation.
+    with open(os.path.join(outdir, 'train.txt'), 'w') as out_train:
+        with open(os.path.join(outdir, 'valid.txt'), 'w') as out_valid:
+            with open(os.path.join(datadir, 'train.txt'), 'r') as f:
+                for line in f:
+                    features = line.rstrip('\n').split('\t')
+
+                    continous_vals = []
+                    for i in range(0, len(continous_features)):
+                        val = dists.gen(i, features[continous_features[i]])
+                        continous_vals.append(
+                            "{0:.6f}".format(val).rstrip('0').rstrip('.'))
+                    categorial_vals = []
+                    for i in range(0, len(categorial_features)):
+                        val = dicts.gen(i, features[categorial_features[
+                            i]]) + categorial_feature_offset[i]
+                        categorial_vals.append(str(val))
+
+                    continous_vals = ','.join(continous_vals)
+                    categorial_vals = ','.join(categorial_vals)
+                    label = features[0]
+                    if random.randint(0, 9999) % 10 != 0:
+                        out_train.write('\t'.join(
+                            [continous_vals, categorial_vals, label]) + '\n')
+                    else:
+                        out_valid.write('\t'.join(
+                            [continous_vals, categorial_vals, label]) + '\n')
 
     with open(os.path.join(outdir, 'test.txt'), 'w') as out:
         with open(os.path.join(datadir, 'test.txt'), 'r') as f:
@@ -130,7 +147,8 @@
                 continous_vals = []
                 for i in range(0, len(continous_features)):
                     val = dists.gen(i, features[continous_features[i] - 1])
-                    continous_vals.append(str(val))
+                    continous_vals.append(
+                        "{0:.6f}".format(val).rstrip('0').rstrip('.'))
                 categorial_vals = []
                 for i in range(0, len(categorial_features)):
                     val = dicts.gen(i,
diff --git a/deep_fm/reader.py b/deep_fm/reader.py
index 2ac30ecc533f99f6d49a5bf02cdc6b15bc3fe0f9..1098ce423c9071864671be91dea81972e47fbc98 100644
--- a/deep_fm/reader.py
+++ b/deep_fm/reader.py
@@ -18,6 +18,9 @@ class Dataset:
     def train(self, path):
         return self._reader_creator(path, False)
 
+    def test(self, path):
+        return self._reader_creator(path, False)
+
     def infer(self, path):
         return self._reader_creator(path, True)
diff --git a/deep_fm/train.py b/deep_fm/train.py
index 2be7e7d990616cfffd4bbfce93e3687cba58c1c1..6f7889995f989d67ee2e4517e447d66d0c5e7a92 100755
--- a/deep_fm/train.py
+++ b/deep_fm/train.py
@@ -20,11 +20,16 @@ def parse_args():
         type=str,
         required=True,
         help="path of training dataset")
+    parser.add_argument(
+        '--test_data_path',
+        type=str,
+        required=True,
+        help="path of testing dataset")
     parser.add_argument(
         '--batch_size',
         type=int,
-        default=10000,
-        help="size of mini-batch (default:10000)")
+        default=1000,
+        help="size of mini-batch (default:1000)")
     parser.add_argument(
         '--num_passes',
         type=int,
@@ -52,7 +57,7 @@ def train():
 
     paddle.init(use_gpu=False, trainer_count=1)
 
-    optimizer = paddle.optimizer.Adam(learning_rate=1e-3)
+    optimizer = paddle.optimizer.Adam(learning_rate=1e-4)
 
     model = DeepFM(args.factor_size)
 
@@ -66,11 +71,22 @@ def train():
     def __event_handler__(event):
         if isinstance(event, paddle.event.EndIteration):
             num_samples = event.batch_id * args.batch_size
-            if event.batch_id % 10 == 0:
-                logger.warning("Pass %d, Batch %d, Samples %d, Cost %f" % (
-                    event.pass_id, event.batch_id, num_samples, event.cost))
+            if event.batch_id % 100 == 0:
+                logger.warning("Pass %d, Batch %d, Samples %d, Cost %f, %s" %
+                               (event.pass_id, event.batch_id, num_samples,
+                                event.cost, event.metrics))
+
+            if event.batch_id % 10000 == 0:
+                if args.test_data_path:
+                    result = trainer.test(
+                        reader=paddle.batch(
+                            dataset.test(args.test_data_path),
+                            batch_size=args.batch_size),
+                        feeding=reader.feeding)
+                    logger.warning("Test %d-%d, Cost %f, %s" %
+                                   (event.pass_id, event.batch_id, result.cost,
+                                    result.metrics))
 
-            if event.batch_id % 1000 == 0:
                 path = "{}/model-pass-{}-batch-{}.tar.gz".format(
                     args.model_output_dir, event.pass_id, event.batch_id)
                 with gzip.open(path, 'w') as f:
@@ -80,7 +96,7 @@ def train():
         reader=paddle.batch(
             paddle.reader.shuffle(
                 dataset.train(args.train_data_path),
-                buf_size=args.batch_size * 100),
+                buf_size=args.batch_size * 10000),
             batch_size=args.batch_size),
         feeding=reader.feeding,
         event_handler=__event_handler__,