Merge branch 'develop' of https://github.com/PaddlePaddle/models into ocr_ctc

744908f0 · wanghaoshuang · fb8ae401 · bc8b6040 · 744908f0 · 744908f0
50 changed file
--- a/.travis/unittest.sh
+++ b/.travis/unittest.sh
@@ -24,11 +24,22 @@ unittest(){
 trap 'abort' 0
 set -e
-for proj in */ ; do
+for proj in * ; do
    if [ -d $proj ]; then
-        unittest $proj
+        if [ "$proj" = "fluid" ]; then
-        if [ $? != 0 ]; then
+            for proj in fluid/* ; do
-            exit 1
+                if [ -d $proj ]; then
+                    unittest $proj
+                    if [ $? != 0 ]; then
+                        exit 1
+                    fi
+                fi
+            done
+        else
+            unittest $proj
+            if [ $? != 0 ]; then
+                exit 1
+            fi
        fi
    fi
 done

--- a/fluid/DeepASR/README.md
+++ b/fluid/DeepASR/README.md
+Deep ASR Kickoff
--- a/fluid/DeepASR/data_utils/__init__.py
+++ b/fluid/DeepASR/data_utils/__init__.py
--- a/fluid/DeepASR/data_utils/augmentor/__init__.py
+++ b/fluid/DeepASR/data_utils/augmentor/__init__.py
--- a/fluid/DeepASR/data_utils/augmentor/tests/data/global_mean_var_search26kHr
+++ b/fluid/DeepASR/data_utils/augmentor/tests/data/global_mean_var_search26kHr
+16.2845556399 11.6891798673
+17.21509949 12.3788567902
+18.1143704548 14.9912618017
+19.2335963752 18.5419556172
+19.9266772451 21.2768220522
+19.8245737202 21.2347210705
+19.5432940972 20.2784036567
+19.4631271754 20.2934452329
+19.3929919324 20.457971868
+19.2924788362 20.3626439234
+18.9207244502 19.9196569759
+18.7202605641 19.5920276899
+18.4844279398 19.2068349019
+18.2670948624 18.8716893824
+18.0929628855 18.5439666541
+17.8428896026 18.0255891747
+17.6646850635 17.473764296
+17.4955705896 16.8966859471
+17.3706720293 16.4294027467
+17.2530867792 16.0514717623
+17.1304341172 15.7234699057
+17.0038353287 15.4344471514
+16.902550309 15.1603287337
+16.8375590047 14.9304337826
+16.816287853 14.9119310513
+16.828838265 15.0930023024
+16.8602209498 15.3771992423
+16.9101763812 15.6897991789
+16.9466065143 15.9364556489
+16.9486061956 16.0699417826
+16.9041374104 16.0796970272
+16.8410093699 16.0111444599
+16.7045718836 15.7991985601
+16.51128489 15.5208920129
+16.3253910608 15.2603181921
+16.1297317333 14.9499965958
+15.903428372 14.5958280409
+15.6131718105 14.2709618
+15.1395035533 13.9993939893
+14.4298229999 13.3841189151
+0.0034970565424 0.246184766149
+0.00501284154705 0.238484972472
+0.00605942680019 0.269064381708
+0.00687266156243 0.319479238011
+0.00734065019253 0.371947383205
+0.00718807218417 0.384426479694
+0.00652195540212 0.384676838281
+0.00660416525951 0.395543910317
+0.00680202057642 0.400803979681
+0.00659144183007 0.393228973031
+0.00605294530423 0.385021118038
+0.00590452969394 0.361763039625
+0.00612315374687 0.346777773373
+0.00582354093973 0.335802403976
+0.00574556002554 0.320733728218
+0.00612254485891 0.310153103033
+0.00626733043219 0.299854747445
+0.00567398408041 0.293353685493
+0.00519236700706 0.287668810947
+0.00529581474367 0.281479660772
+0.00479019484082 0.27451415777
+0.00486381039428 0.266294391154
+0.00491126372868 0.258105116126
+0.00452105305011 0.252926328298
+0.00531483334271 0.250910887373
+0.00546572110469 0.253302256977
+0.00479544857908 0.258484183394
+0.00422106426297 0.264582900173
+0.00401824135188 0.268467945623
+0.0041705465252 0.269699480291
+0.00405239564143 0.270406162975
+0.0040059737566 0.270407601782
+0.00406426729317 0.267951582656
+0.00416613791013 0.264543833042
+0.00427847607653 0.26247798891
+0.00428050903034 0.259635263243
+0.00454842971786 0.255829377617
+0.00393747552387 0.253802307025
+0.00374143688909 0.251011478787
+0.00335475310258 0.236543650856
+0.000373194755312 0.0419494800709
+0.000230909648678 0.0394102370205
+0.000150840015851 0.0414956922398
+8.44401840771e-05 0.0460502231327
+-6.24759314572e-06 0.0528049937739
+-8.82957758148e-05 0.055711244886
+1.16795791952e-05 0.0563188428833
+-1.68716267856e-05 0.0575232763711
+-0.000112625308645 0.057979929947
+-0.000122619090002 0.0564126233493
+1.73569637319e-05 0.05522573909
+6.49872782342e-05 0.0507353361334
+4.17746389178e-05 0.0479568131253
+5.13884475653e-05 0.0461253238047
+1.8860115143e-05 0.0436860476919
+-5.64317701105e-05 0.042516381059
+-0.000136859948115 0.0413574820205
+-7.00847019726e-05 0.0409516370727
+-5.39392223336e-05 0.040441504085
+-9.24897162815e-05 0.0397800398173
+4.7104970622e-05 0.039046286243
+6.24805896165e-06 0.0380185986602
+-2.35272813418e-05 0.036851063786
+5.88344154127e-05 0.0361640489242
+-8.39162076993e-05 0.0357639427311
+-0.000108702805776 0.0358774639538
+3.22013961834e-06 0.0363644530435
+9.43501518394e-05 0.0370309934774
+0.000134406229423 0.0374972993343
+3.84007008533e-05 0.037676222515
+3.05989328157e-05 0.0379111939182
+9.52201629091e-05 0.0380927209106
+0.000102126083729 0.0379925358499
+6.98628072264e-05 0.0377276252241
+4.55782256339e-05 0.0375165468654
+4.76370987786e-05 0.0371482526345
+-2.24128832709e-05 0.0366810742947
+0.000125621306953 0.036628355271
+0.000134568666093 0.0364860461759
+0.000159858844464 0.0345583593149
--- a/fluid/DeepASR/data_utils/augmentor/tests/test_data_trans.py
+++ b/fluid/DeepASR/data_utils/augmentor/tests/test_data_trans.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import sys
+import unittest
+import numpy as np
+import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm
+import data_utils.augmentor.trans_add_delta as trans_add_delta
+import data_utils.augmentor.trans_splice as trans_splice
+class TestTransMeanVarianceNorm(unittest.TestCase):
+    """Checks TransMeanVarianceNorm against the bundled statistics file."""
+    def setUp(self):
+        # Relative path: resolved against the current working directory.
+        self._file_path = ("./data_utils/augmentor/tests/data/"
+                           "global_mean_var_search26kHr")
+    def test(self):
+        """An all-ones input must come back as (1 - mean) * var per column."""
+        ones_input = np.ones((2, 120), dtype="float32")
+        normalizer = trans_mean_variance_norm.TransMeanVarianceNorm(
+            self._file_path)
+        normalized, _ = normalizer.perform_trans((ones_input, None))
+        mean, var = normalizer.get_mean_var()
+        # The expected value is built from a float32 one rather than from
+        # ones_input, so it does not depend on whether the input was modified.
+        unit = np.float32(1)
+        for pos, actual in enumerate(normalized.flatten()):
+            col = pos % 120
+            self.assertAlmostEqual(actual, (unit - mean[col]) * var[col])
+class TestTransAddDelta(unittest.TestCase):
+    """Unit tests for TransAddDelta."""
+    def test_regress(self):
+        """Run _regress twice on a hand-padded buffer and check the corners."""
+        frames = np.zeros((14, 120), dtype="float32")
+        # Rows 0-4 repeat the first frame, rows 5-8 ramp 1..4 and rows 8-13
+        # repeat the last frame; only the first 40 columns carry data.
+        frames[0:5, 0:40].fill(1)
+        for offset, level in enumerate((1, 2, 3, 4)):
+            frames[offset + 5, 0:40].fill(level)
+        frames[8:14, 0:40].fill(4)
+        delta = trans_add_delta.TransAddDelta()
+        flat = frames.reshape((14 * 120))
+        first_frame = 5 * 120
+        delta._regress(flat, first_frame, flat, first_frame + 40, 40, 4, 120)
+        delta._regress(flat, first_frame + 40, flat, first_frame + 80, 40, 4,
+                       120)
+        frames = flat.reshape((14, 120))
+        window = frames[5:5 + 4, :]
+        # (column 0, column 119) expected for each of the four real frames.
+        expected = ((1.0, 0.24), (2.0, 0.13), (3.0, -0.13), (4.0, -0.24))
+        for row, (first_col, last_col) in enumerate(expected):
+            self.assertAlmostEqual(first_col, window[row][0])
+            self.assertAlmostEqual(last_col, window[row][119])
+    def test_perform(self):
+        """perform_trans must pad, regress and trim to the same numbers."""
+        frames = np.zeros((4, 40), dtype="float32")
+        for row, level in enumerate((1, 2, 3, 4)):
+            frames[row, 0:40].fill(level)
+        delta = trans_add_delta.TransAddDelta()
+        (result, label) = delta.perform_trans((frames, None))
+        self.assertAlmostEqual(result.shape[0], 4)
+        self.assertAlmostEqual(result.shape[1], 120)
+        expected = ((1.0, 0.24), (2.0, 0.13), (3.0, -0.13), (4.0, -0.24))
+        for row, (first_col, last_col) in enumerate(expected):
+            self.assertAlmostEqual(first_col, result[row][0])
+            self.assertAlmostEqual(last_col, result[row][119])
+class TestTransSplict(unittest.TestCase):
+    """Unit test for TransSplice."""
+    def test_perfrom(self):
+        """Row i of the input holds the value i; check every spliced cell."""
+        frames = np.zeros((8, 10), dtype="float32")
+        for row in xrange(frames.shape[0]):
+            frames[row, :].fill(row)
+        splicer = trans_splice.TransSplice()
+        (spliced, label) = splicer.perform_trans((frames, None))
+        self.assertEqual(spliced.shape[1], 110)
+        for row in xrange(8):
+            # Number of leading context slots that still show frame 0.
+            nzero_num = 5 - row
+            expected = 0.0 if nzero_num >= 0 else row - 5 - 1
+            for slot in xrange(11):
+                # Past the clamped head the value climbs by one per slot
+                # until it saturates at the last frame (7).
+                if slot > nzero_num and expected < 7:
+                    expected += 1.0
+                for col in xrange(10):
+                    self.assertAlmostEqual(spliced[row][slot * 10 + col],
+                                           expected)
+if __name__ == '__main__':
+    unittest.main()
--- a/fluid/DeepASR/data_utils/augmentor/trans_add_delta.py
+++ b/fluid/DeepASR/data_utils/augmentor/trans_add_delta.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+import math
+import copy
+class TransAddDelta(object):
+    """Append delta and delta-delta features to every frame.
+    Turns a feature matrix of shape (a, b) into shape (a, b * 3): columns
+    [0, b) keep the input, [b, 2b) hold its regression delta and [2b, 3b)
+    hold the delta of that delta.
+    Attributes:
+        _norder(int): stored from the constructor but never read in this
+            class; two regression passes are always run.
+        _nwindow(int): number of frames looked at on each side of the
+            current frame when computing a delta.
+    """
+    def __init__(self, norder=2, nwindow=2):
+        """ init construction
+            Args:
+                norder: default 2
+                nwindow: default 2
+        """
+        self._norder = norder
+        self._nwindow = nwindow
+    def perform_trans(self, sample):
+        """ add delta for feature
+            trans feature shape from (a,b) to (a, b * 3)
+            The input is padded with 5 copies of its first frame in front and
+            5 copies of its last frame behind so the regression window can
+            reach past both ends; the padding rows are dropped again before
+            returning. Deltas are only computed for the real frames, so the
+            delta columns of the padding rows stay zero when the second pass
+            reads them.
+            Args:
+                sample(object,tuple): contain feature numpy and label numpy
+            Returns:
+                (feature, label): float32 feature of shape (a, b * 3); label
+                is passed through unchanged
+        """
+        (feature, label) = sample
+        nframe = feature.shape[0]
+        frame_dim = feature.shape[1]
+        d_frame_dim = frame_dim * 3
+        head_filled = 5
+        tail_filled = 5
+        mat = np.zeros(
+            (nframe + head_filled + tail_filled, d_frame_dim), dtype="float32")
+        # copy first frame
+        for i in range(head_filled):
+            np.copyto(mat[i, 0:frame_dim], feature[0, :])
+        np.copyto(mat[head_filled:head_filled + nframe, 0:frame_dim],
+                  feature[:, :])
+        # copy last frame
+        for i in range(head_filled + nframe, mat.shape[0]):
+            np.copyto(mat[i, 0:frame_dim], feature[nframe - 1, :])
+        # _regress addresses a flat buffer by element offset.
+        tmp_shape = mat.shape
+        mat = mat.reshape((tmp_shape[0] * tmp_shape[1]))
+        first = head_filled * d_frame_dim
+        # pass 1: input columns -> delta columns
+        self._regress(mat, first, mat, first + frame_dim, frame_dim, nframe,
+                      d_frame_dim)
+        # pass 2: delta columns -> delta-delta columns
+        self._regress(mat, first + frame_dim, mat, first + 2 * frame_dim,
+                      frame_dim, nframe, d_frame_dim)
+        mat.shape = tmp_shape
+        return (mat[head_filled:mat.shape[0] - tail_filled, :], label)
+    def _regress(self, data_in, start_in, data_out, start_out, size, n, step):
+        """ regress
+            For each of the n frames and each of the size coefficients writes
+                sum_{t=1..W} t * (x[+t] - x[-t]) / (2 * sum_{t=1..W} t * t)
+            where W is self._nwindow and x[+t] / x[-t] is the same
+            coefficient t frames after / before the current one.
+            Args:
+                data_in: flat (1-D) input buffer
+                start_in: index in data_in of the first coefficient of the
+                          first frame to process
+                data_out: flat (1-D) output buffer; perform_trans passes the
+                          same array as data_in
+                start_out: index in data_out where the first result goes
+                size: number of coefficients processed per frame
+                n: frame num
+                step: distance in elements between the same coefficient of
+                      two consecutive frames (perform_trans passes
+                      3 * frame dim)
+            Returns:
+                None, data_out is modified in place
+        """
+        delta_window = self._nwindow
+        sigma_t2 = 0.0
+        for t in range(1, delta_window + 1):
+            sigma_t2 += t * t
+        sigma_t2 *= 2.0
+        for i in range(n):
+            fp1 = start_in
+            fp2 = start_out
+            for j in range(size):
+                back = fp1
+                forw = fp1
+                # named acc so the builtin sum() is not shadowed
+                acc = 0.0
+                for t in range(1, delta_window + 1):
+                    back -= step
+                    forw += step
+                    acc += t * (data_in[forw] - data_in[back])
+                data_out[fp2] = acc / sigma_t2
+                fp1 += 1
+                fp2 += 1
+            start_in += step
+            start_out += step
--- a/fluid/DeepASR/data_utils/augmentor/trans_mean_variance_norm.py
+++ b/fluid/DeepASR/data_utils/augmentor/trans_mean_variance_norm.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+import math
+class TransMeanVarianceNorm(object):
+    """ normalization of mean variance for feature data
+        Attributes:
+            _mean(numpy.array): the feature mean vector
+            _var(numpy.array): per-dimension scale factor, stored as
+                               1 / sqrt(variance) and capped at 100000
+            _nLen(int): number of (mean, variance) rows that were loaded
+    """
+    def __init__(self, snorm_path):
+        """init construction
+            Args:
+                snorm_path: the path of mean and variance
+        """
+        self._mean = None
+        self._var = None
+        self._load_norm(snorm_path)
+    def _load_norm(self, snorm_path):
+        """ load mean var file
+            Every line must hold exactly two whitespace separated numbers:
+            the mean and the variance of one feature dimension.
+            Args:
+                snorm_path(str):the file path
+        """
+        # Context manager so the handle is closed even if reading fails.
+        with open(snorm_path) as norm_file:
+            lines = norm_file.readlines()
+        count = len(lines)
+        self._mean = np.zeros((count), dtype="float32")
+        self._var = np.zeros((count), dtype="float32")
+        self._nLen = count
+        for nidx, line in enumerate(lines):
+            fields = line.split()
+            assert len(fields) == 2
+            self._mean[nidx] = float(fields[0])
+            # Store the inverse standard deviation so normalising is a
+            # multiplication; clamp it for near-zero variances.
+            self._var[nidx] = 1.0 / math.sqrt(float(fields[1]))
+            if self._var[nidx] > 100000.0:
+                self._var[nidx] = 100000.0
+    def get_mean_var(self):
+        """ get mean and var
+            Args:
+            Returns:
+                (mean, var), var being the stored 1 / sqrt(variance) scale
+        """
+        return (self._mean, self._var)
+    def perform_trans(self, sample):
+        """ feature = (feature - mean) * var
+            The statistics are applied to consecutive runs of _nLen values of
+            the flattened feature. When reshape can return a view (e.g. a
+            contiguous array) the input array itself is overwritten.
+            Args:
+                sample(object):input sample, contain feature numpy and label numpy
+            Returns:
+                (feature, label)
+        """
+        (feature, label) = sample
+        shape = feature.shape
+        assert len(shape) == 2
+        nfeature_len = shape[0] * shape[1]
+        assert nfeature_len % self._nLen == 0
+        flat = feature.reshape((nfeature_len))
+        # One row per run of _nLen values; broadcasting replaces the
+        # block-by-block Python loop and still writes through the view.
+        blocks = flat.reshape((-1, self._nLen))
+        blocks[...] = (blocks - self._mean) * self._var
+        return (flat.reshape(shape), label)
--- a/fluid/DeepASR/data_utils/augmentor/trans_splice.py
+++ b/fluid/DeepASR/data_utils/augmentor/trans_splice.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+import math
+class TransSplice(object):
+    """ copy feature context to construct new feature
+        expand feature data from shape (frame_num, frame_dim)
+        to shape (frame_num, frame_dim * (left + right + 1)),
+        i.e. frame_dim * 11 with the default context sizes
+        Attributes:
+            _nleft_context(int): copy left context number
+            _nright_context(int): copy right context number
+    """
+    def __init__(self, nleft_context=5, nright_context=5):
+        """ init construction
+            Args:
+                nleft_context(int): frames of left context, default 5
+                nright_context(int): frames of right context, default 5
+        """
+        self._nleft_context = nleft_context
+        self._nright_context = nright_context
+    def perform_trans(self, sample):
+        """ copy feature context
+        Each output row is the concatenation of the left context frames, the
+        frame itself and the right context frames. Positions before the first
+        frame repeat the first frame, positions after the last frame repeat
+        the last frame.
+        Args:
+            sample(object): input sample(feature, label)
+        Return:
+            (feature, label): feature is a new float32 array, label is
+            passed through unchanged
+        """
+        (feature, label) = sample
+        nframe_num = feature.shape[0]
+        nframe_dim = feature.shape[1]
+        nnew_frame_dim = nframe_dim * (
+            self._nleft_context + self._nright_context + 1)
+        mat = np.zeros(
+            (nframe_num + self._nleft_context + self._nright_context,
+             nframe_dim),
+            dtype="float32")
+        ret = np.zeros((nframe_num, nnew_frame_dim), dtype="float32")
+        # copy left
+        for i in range(self._nleft_context):
+            mat[i, :] = feature[0, :]
+        # copy middle
+        mat[self._nleft_context:self._nleft_context +
+            nframe_num, :] = feature[:, :]
+        # copy right
+        for i in range(self._nright_context):
+            mat[i + self._nleft_context + nframe_num, :] = feature[-1, :]
+        # In the flattened padded matrix the context window of frame i is
+        # the contiguous run that starts at element i * nframe_dim.
+        mat = mat.reshape(mat.shape[0] * mat.shape[1])
+        ret = ret.reshape(ret.shape[0] * ret.shape[1])
+        for i in range(nframe_num):
+            np.copyto(ret[i * nnew_frame_dim:(i + 1) * nnew_frame_dim],
+                      mat[i * nframe_dim:i * nframe_dim + nnew_frame_dim])
+        ret = ret.reshape((nframe_num, nnew_frame_dim))
+        return (ret, label)
--- a/fluid/DeepASR/data_utils/data_reader.py
+++ b/fluid/DeepASR/data_utils/data_reader.py
+"""This module contains data processing related logic.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import random
+import struct
+import Queue
+import time
+import numpy as np
+from threading import Thread
+import signal
+from multiprocessing import Manager, Process
+import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm
+import data_utils.augmentor.trans_add_delta as trans_add_delta
+from data_utils.util import suppress_complaints, suppress_signal
+class SampleInfo(object):
+    """SampleInfo holds the necessary information to load a sample from disk.
+    Args:
+        feature_bin_path (str): File containing the feature data.
+        feature_start (int): Start position of the sample's feature data.
+        feature_size (int): Byte count of the sample's feature data.
+        feature_frame_num (int): Time length of the sample.
+        feature_dim (int): Feature dimension of one frame.
+        label_bin_path (str): File containing the label data.
+        label_start (int): Start position of the sample's label data.
+        label_size (int): Byte count of the sample's label data.
+        label_frame_num (int): Label number of the sample.
+    """
+    def __init__(self, feature_bin_path, feature_start, feature_size,
+                 feature_frame_num, feature_dim, label_bin_path, label_start,
+                 label_size, label_frame_num):
+        # Plain record: every argument is stored under the same name.
+        self.feature_bin_path = feature_bin_path
+        self.feature_start = feature_start
+        self.feature_size = feature_size
+        self.feature_frame_num = feature_frame_num
+        self.feature_dim = feature_dim
+        self.label_bin_path = label_bin_path
+        self.label_start = label_start
+        self.label_size = label_size
+        self.label_frame_num = label_frame_num
+class SampleInfoBucket(object):
+    """SampleInfoBucket contains paths of several description files. Feature
+    description file contains necessary information (including path of binary
+    data, sample start position, sample byte number etc.) to access samples'
+    feature data and the same with the label description file. SampleInfoBucket
+    is the minimum unit to do shuffle.
+    Args:
+        feature_bin_paths (list|tuple): Files containing the binary feature
+                                        data.
+        feature_desc_paths (list|tuple): Files containing the description of
+                                         samples' feature data.
+        label_bin_paths (list|tuple): Files containing the binary label data.
+        label_desc_paths (list|tuple): Files containing the description of
+                                       samples' label data.
+    """
+    def __init__(self, feature_bin_paths, feature_desc_paths, label_bin_paths,
+                 label_desc_paths):
+        # All four sequences are parallel: entry i describes block i.
+        block_num = len(label_bin_paths)
+        assert len(label_desc_paths) == block_num
+        assert len(feature_bin_paths) == block_num
+        assert len(feature_desc_paths) == block_num
+        self._block_num = block_num
+        self._feature_bin_paths = feature_bin_paths
+        self._feature_desc_paths = feature_desc_paths
+        self._label_bin_paths = label_bin_paths
+        self._label_desc_paths = label_desc_paths
+    def generate_sample_info_list(self):
+        """Parse every block's description files into SampleInfo objects.
+        The second field of the first line of a description file is the
+        sample count; each following line describes one sample, with the
+        start offset, byte size and frame count in fields 2, 3 and 4 (and,
+        for features, the frame dimension in field 5).
+        Returns:
+            list: one SampleInfo per sample, blocks in their stored order.
+        """
+        sample_info_list = []
+        for block_idx in range(self._block_num):
+            label_bin_path = self._label_bin_paths[block_idx]
+            feature_bin_path = self._feature_bin_paths[block_idx]
+            # Read both description files up front and close them at once
+            # instead of leaving the handles to the garbage collector.
+            with open(self._label_desc_paths[block_idx]) as label_desc_file:
+                label_desc_lines = label_desc_file.readlines()
+            with open(self._feature_desc_paths[block_idx]) as feature_desc_file:
+                feature_desc_lines = feature_desc_file.readlines()
+            sample_num = int(label_desc_lines[0].split()[1])
+            assert sample_num == int(feature_desc_lines[0].split()[1])
+            for i in range(sample_num):
+                feature_desc_split = feature_desc_lines[i + 1].split()
+                feature_start = int(feature_desc_split[2])
+                feature_size = int(feature_desc_split[3])
+                feature_frame_num = int(feature_desc_split[4])
+                feature_dim = int(feature_desc_split[5])
+                label_desc_split = label_desc_lines[i + 1].split()
+                label_start = int(label_desc_split[2])
+                label_size = int(label_desc_split[3])
+                label_frame_num = int(label_desc_split[4])
+                sample_info_list.append(
+                    SampleInfo(feature_bin_path, feature_start, feature_size,
+                               feature_frame_num, feature_dim, label_bin_path,
+                               label_start, label_size, label_frame_num))
+        return sample_info_list
+class EpochEndSignal():
+    """Marker object put on a queue to tell the consumer a producer is done."""
+    pass
+class DataReader(object):
+    """DataReader provides basic audio sample preprocessing pipeline including
+    data loading and data augmentation.
+    Args:
+        feature_file_list (str): File containing paths of feature data file and
+                                 corresponding description file.
+        label_file_list (str): File containing paths of label data file and 
+                               corresponding description file.
+        drop_frame_len (int): Samples whose label length above the value will be
+                              dropped.
+        process_num (int): Number of processes for processing data.
+        sample_buffer_size (int): Buffer size to indicate the maximum samples 
+                                  cached.
+        sample_info_buffer_size (int): Buffer size to indicate the maximum 
+                                       sample information cached.
+        batch_buffer_size (int): Buffer size to indicate the maximum batch 
+                                 cached.
+        shuffle_block_num (int): Block number indicating the minimum unit to do 
+                                 shuffle.
+        random_seed (int): Random seed.
+        verbose (int): If set to 0, complaints including exceptions and signal 
+                       traceback from sub-process will be suppressed. If set 
+                       to 1, all complaints will be printed.
+    """
+    def __init__(self,
+                 feature_file_list,
+                 label_file_list,
+                 drop_frame_len=512,
+                 process_num=10,
+                 sample_buffer_size=1024,
+                 sample_info_buffer_size=1024,
+                 batch_buffer_size=1024,
+                 shuffle_block_num=1,
+                 random_seed=0,
+                 verbose=0):
+        """Store the configuration and build the first (shuffled) bucket list.
+        The two list files are read here through generate_bucket_list, and a
+        multiprocessing Manager is started for the queues used later.
+        """
+        self._feature_file_list = feature_file_list
+        self._label_file_list = label_file_list
+        self._drop_frame_len = drop_frame_len
+        self._shuffle_block_num = shuffle_block_num
+        self._block_info_list = None
+        self._rng = random.Random(random_seed)
+        self._bucket_list = None
+        # Default to "no transformers" so the worker loop can iterate the
+        # attribute even when set_transformers() is never called; without
+        # this the workers hit an AttributeError.
+        self._transformers = []
+        self.generate_bucket_list(True)
+        self._order_id = 0
+        self._manager = Manager()
+        self._sample_buffer_size = sample_buffer_size
+        self._sample_info_buffer_size = sample_info_buffer_size
+        self._batch_buffer_size = batch_buffer_size
+        self._process_num = process_num
+        self._verbose = verbose
+    def generate_bucket_list(self, is_shuffle):
+        """(Re)build self._bucket_list from the feature/label list files.
+        The list files are parsed only on the first call. Lines are consumed
+        in pairs, and each block is kept as the stripped 4-item list
+        [feature line i, feature line i + 1, label line i, label line i + 1],
+        which is handed to SampleInfoBucket as (feature_bin_paths,
+        feature_desc_paths, label_bin_paths, label_desc_paths).
+        Args:
+            is_shuffle (bool): Shuffle the block order before grouping blocks
+                               into buckets of shuffle_block_num blocks.
+        """
+        if self._block_info_list is None:
+            # Close both list files right after reading them.
+            with open(self._feature_file_list) as feature_list_file:
+                block_feature_info_lines = feature_list_file.readlines()
+            with open(self._label_file_list) as label_list_file:
+                block_label_info_lines = label_list_file.readlines()
+            assert len(block_feature_info_lines) == len(block_label_info_lines)
+            self._block_info_list = []
+            for i in range(0, len(block_feature_info_lines), 2):
+                block_info = (block_feature_info_lines[i],
+                              block_feature_info_lines[i + 1],
+                              block_label_info_lines[i],
+                              block_label_info_lines[i + 1])
+                # Real lists (not map objects) so indexing and len() work on
+                # every Python version.
+                self._block_info_list.append(
+                    [line.strip() for line in block_info])
+        if is_shuffle:
+            self._rng.shuffle(self._block_info_list)
+        self._bucket_list = []
+        for i in range(0, len(self._block_info_list), self._shuffle_block_num):
+            bucket_block_info = self._block_info_list[i:i +
+                                                      self._shuffle_block_num]
+            self._bucket_list.append(
+                SampleInfoBucket(
+                    [info[0] for info in bucket_block_info],
+                    [info[1] for info in bucket_block_info],
+                    [info[2] for info in bucket_block_info],
+                    [info[3] for info in bucket_block_info]))
+    # @TODO make this configurable
+    def set_transformers(self, transformers):
+        """Set the transformers applied, in order, to every loaded sample.
+        Each item must provide perform_trans(sample), which receives and
+        returns a (feature, label) tuple.
+        """
+        self._transformers = transformers
+    def _sample_generator(self):
+        """Yield transformed (feature, label) samples for one epoch.
+        A feeding thread pushes (SampleInfo, order_id) items into a queue,
+        process_num worker processes load and transform them, and a shared
+        counter makes the workers emit their results in feeding order.
+        Samples with more frames than drop_frame_len are dropped.
+        """
+        sample_info_queue = self._manager.Queue(self._sample_info_buffer_size)
+        sample_queue = self._manager.Queue(self._sample_buffer_size)
+        self._order_id = 0
+        @suppress_complaints(verbose=self._verbose)
+        def ordered_feeding_task(sample_info_queue):
+            for sample_info_bucket in self._bucket_list:
+                sample_info_list = sample_info_bucket.generate_sample_info_list(
+                )
+                self._rng.shuffle(sample_info_list)  # do shuffle here
+                for sample_info in sample_info_list:
+                    sample_info_queue.put((sample_info, self._order_id))
+                    self._order_id += 1
+            # One end marker per worker so every worker leaves its loop.
+            for i in range(self._process_num):
+                sample_info_queue.put(EpochEndSignal())
+        feeding_thread = Thread(
+            target=ordered_feeding_task, args=(sample_info_queue, ))
+        feeding_thread.daemon = True
+        feeding_thread.start()
+        @suppress_complaints(verbose=self._verbose)
+        def ordered_processing_task(sample_info_queue, sample_queue, out_order):
+            if self._verbose == 0:
+                signal.signal(signal.SIGTERM, suppress_signal)
+                signal.signal(signal.SIGINT, suppress_signal)
+            def read_bytes(fpath, start, size):
+                # The payload is raw struct data: open in binary mode so no
+                # newline translation or text decoding can touch it, and let
+                # the context manager close the file even if seek/read fails.
+                with open(fpath, 'rb') as f:
+                    f.seek(start, 0)
+                    return f.read(size)
+            ins = sample_info_queue.get()
+            while not isinstance(ins, EpochEndSignal):
+                sample_info, order_id = ins
+                feature_bytes = read_bytes(sample_info.feature_bin_path,
+                                           sample_info.feature_start,
+                                           sample_info.feature_size)
+                label_bytes = read_bytes(sample_info.label_bin_path,
+                                         sample_info.label_start,
+                                         sample_info.label_size)
+                # Labels are unpacked as 4-byte unsigned ints.
+                assert sample_info.label_frame_num * 4 == len(label_bytes)
+                label_array = struct.unpack('I' * sample_info.label_frame_num,
+                                            label_bytes)
+                label_data = np.array(
+                    label_array, dtype='int64').reshape(
+                        (sample_info.label_frame_num, 1))
+                # Features are unpacked as 4-byte floats, frame after frame.
+                feature_frame_num = sample_info.feature_frame_num
+                feature_dim = sample_info.feature_dim
+                assert feature_frame_num * feature_dim * 4 == len(feature_bytes)
+                feature_array = struct.unpack('f' * feature_frame_num *
+                                              feature_dim, feature_bytes)
+                feature_data = np.array(
+                    feature_array, dtype='float32').reshape((
+                        sample_info.feature_frame_num, sample_info.feature_dim))
+                sample_data = (feature_data, label_data)
+                for transformer in self._transformers:
+                    # @TODO(pkuyym) to make transfomer only accept feature_data
+                    sample_data = transformer.perform_trans(sample_data)
+                # Wait for our turn so samples leave in feeding order.
+                while order_id != out_order[0]:
+                    time.sleep(0.001)
+                # drop long sentence
+                if self._drop_frame_len >= sample_data[0].shape[0]:
+                    sample_queue.put(sample_data)
+                out_order[0] += 1
+                ins = sample_info_queue.get()
+            sample_queue.put(EpochEndSignal())
+        out_order = self._manager.list([0])
+        args = (sample_info_queue, sample_queue, out_order)
+        workers = [
+            Process(
+                target=ordered_processing_task, args=args)
+            for _ in range(self._process_num)
+        ]
+        for w in workers:
+            w.daemon = True
+            w.start()
+        # The epoch is over once every worker has sent its end marker.
+        finished_process_num = 0
+        while finished_process_num < self._process_num:
+            sample = sample_queue.get()
+            if isinstance(sample, EpochEndSignal):
+                finished_process_num += 1
+                continue
+            yield sample
+        feeding_thread.join()
+        for w in workers:
+            w.join()
+    def batch_iterator(self, batch_size, minimum_batch_size):
+        """Yield (batch_feature, batch_label, lod) tuples for one epoch.
+        Samples are concatenated along the frame axis; lod is the list of
+        cumulative frame offsets, starting at 0, marking where each sample
+        begins and ends inside the batch.
+        Args:
+            batch_size (int): Number of samples in a full batch.
+            minimum_batch_size (int): The trailing, incomplete batch is only
+                                      emitted if it has at least this many
+                                      samples.
+        """
+        def batch_to_ndarray(batch_samples, lod):
+            # Stack the samples' frames into one feature and one label array.
+            assert len(batch_samples)
+            frame_dim = batch_samples[0][0].shape[1]
+            batch_feature = np.zeros((lod[-1], frame_dim), dtype="float32")
+            batch_label = np.zeros((lod[-1], 1), dtype="int64")
+            start = 0
+            for sample in batch_samples:
+                frame_num = sample[0].shape[0]
+                batch_feature[start:start + frame_num, :] = sample[0]
+                batch_label[start:start + frame_num, :] = sample[1]
+                start += frame_num
+            return (batch_feature, batch_label)
+        @suppress_complaints(verbose=self._verbose)
+        def batch_assembling_task(sample_generator, batch_queue):
+            batch_samples = []
+            lod = [0]
+            for sample in sample_generator():
+                batch_samples.append(sample)
+                lod.append(lod[-1] + sample[0].shape[0])
+                if len(batch_samples) == batch_size:
+                    (batch_feature, batch_label) = batch_to_ndarray(
+                        batch_samples, lod)
+                    batch_queue.put((batch_feature, batch_label, lod))
+                    batch_samples = []
+                    lod = [0]
+            # Leftover samples form a final, smaller batch if large enough.
+            if len(batch_samples) >= minimum_batch_size:
+                (batch_feature, batch_label) = batch_to_ndarray(batch_samples,
+                                                                lod)
+                batch_queue.put((batch_feature, batch_label, lod))
+            batch_queue.put(EpochEndSignal())
+        batch_queue = Queue.Queue(self._batch_buffer_size)
+        assembling_thread = Thread(
+            target=batch_assembling_task,
+            args=(self._sample_generator, batch_queue))
+        assembling_thread.daemon = True
+        assembling_thread.start()
+        # Poll without blocking and sleep briefly whenever the queue is empty.
+        while True:
+            try:
+                batch_data = batch_queue.get_nowait()
+            except Queue.Empty:
+                time.sleep(0.001)
+            else:
+                if isinstance(batch_data, EpochEndSignal):
+                    break
+                yield batch_data
+        assembling_thread.join()
--- a/fluid/DeepASR/data_utils/util.py
+++ b/fluid/DeepASR/data_utils/util.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import sys
+from six import reraise
+from tblib import Traceback
+import numpy as np
+def to_lodtensor(data, place):
+    """Convert a list of sequences into a one-level LoDTensor of int64.
+    Args:
+        data(list): Sequences to concatenate along the first axis; the
+            length of each one defines one LoD segment.
+        place: The place handed to `LoDTensor.set`.
+    Returns:
+        LoDTensor: Holds the flattened data as a column of shape
+            (total_length, 1) with the LoD [[0, len_0, len_0 + len_1, ...]].
+    """
+    # Imported here because this module does not import fluid at top level;
+    # the original referenced `fluid` (and `numpy`) without binding them.
+    import paddle.v2.fluid as fluid
+    # Cumulative offsets of the sequences, starting with 0.
+    lod = [0]
+    for seq in data:
+        lod.append(lod[-1] + len(seq))
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+def lodtensor_to_ndarray(lod_tensor):
+    """Copy a LoDTensor element by element into a float32 ndarray.
+    Returns:
+        tuple: (ndarray shaped like `lod_tensor.get_dims()`, the tensor's lod).
+    """
+    shape = lod_tensor.get_dims()
+    values = np.zeros(shape=shape).astype('float32')
+    element_num = np.product(shape)
+    for idx in xrange(element_num):
+        # ravel() is evaluated per element exactly as before; the write goes
+        # through it into `values`.
+        values.ravel()[idx] = lod_tensor.get_float_element(idx)
+    return values, lod_tensor.lod()
+def suppress_signal(signo, stack_frame):
+    """Do nothing; both arguments are ignored.
+    NOTE(review): the (signo, stack_frame) signature suggests use as a signal
+    handler - confirm against the callers.
+    """
+    pass
+def suppress_complaints(verbose):
+    """Build a decorator that swallows or re-raises exceptions of a function.
+    Args:
+        verbose(int): If equal to 1, an exception raised by the decorated
+            function is re-raised with its traceback; for any other value it
+            is silently dropped and the call returns None.
+    Returns:
+        callable: A decorator. The decorated function returns whatever the
+            wrapped function returns when no exception occurs.
+    """
+    import functools
+    def decorator_maker(func):
+        # wraps() keeps the name/docstring of `func` on the wrapper.
+        @functools.wraps(func)
+        def suppress_warpper(*args, **kwargs):
+            try:
+                # Propagate the result instead of discarding it.
+                return func(*args, **kwargs)
+            # NOTE(review): the bare except also traps KeyboardInterrupt and
+            # SystemExit; kept exactly as in the original.
+            except:
+                et, ev, tb = sys.exc_info()
+                tb = Traceback(tb)
+                if verbose == 1:
+                    reraise(et, ev, tb.as_traceback())
+        return suppress_warpper
+    return decorator_maker
--- a/fluid/DeepASR/model_utils/model.py
+++ b/fluid/DeepASR/model_utils/model.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+def stacked_lstmp_model(hidden_dim,
+                        proj_dim,
+                        stacked_num,
+                        class_num,
+                        parallel=False,
+                        is_train=True):
+    """Build the DeepASR network. The main structure is composed of stacked
+    identical LSTMP (LSTM with recurrent projection) layers.
+    When running in training and validation phase, the feeding dictionary
+    is {'feature', 'label'}, fed by the LodTensor for feature data and
+    label data respectively. And in inference, only `feature` is needed.
+    Args:
+        hidden_dim(int): The hidden state's dimension of the LSTMP layer.
+        proj_dim(int): The projection size of the LSTMP layer.
+        stacked_num(int): The number of stacked LSTMP layers.
+        class_num(int): The number of output classes.
+        parallel(bool): Run in parallel or not, default `False`.
+        is_train(bool): Run in training phase or not, default `True`.
+    Returns:
+        tuple: (prediction, avg_cost, acc) - the softmax output, the mean
+            cross-entropy cost and the accuracy. With `parallel=True` the
+            cost and accuracy are averaged over the outputs of all places.
+    """
+    # network configuration
+    def _net_conf(feature, label):
+        # Front end: sequence convolution followed by batch normalization.
+        seq_conv1 = fluid.layers.sequence_conv(
+            input=feature,
+            num_filters=1024,
+            filter_size=3,
+            filter_stride=1,
+            bias_attr=True)
+        bn1 = fluid.layers.batch_norm(
+            input=seq_conv1,
+            act="sigmoid",
+            is_test=not is_train,
+            momentum=0.9,
+            epsilon=1e-05,
+            data_layout='NCHW')
+        stack_input = bn1
+        # Each stacked block is fc -> dynamic_lstmp -> batch_norm; the output
+        # of one block feeds the next one.
+        for i in range(stacked_num):
+            fc = fluid.layers.fc(input=stack_input,
+                                 size=hidden_dim * 4,
+                                 bias_attr=True)
+            proj, cell = fluid.layers.dynamic_lstmp(
+                input=fc,
+                size=hidden_dim * 4,
+                proj_size=proj_dim,
+                bias_attr=True,
+                use_peepholes=True,
+                is_reverse=False,
+                cell_activation="tanh",
+                proj_activation="tanh")
+            bn = fluid.layers.batch_norm(
+                input=proj,
+                act="sigmoid",
+                is_test=not is_train,
+                momentum=0.9,
+                epsilon=1e-05,
+                data_layout='NCHW')
+            stack_input = bn
+        # Classifier head plus the cost/accuracy used for training.
+        prediction = fluid.layers.fc(input=stack_input,
+                                     size=class_num,
+                                     act='softmax')
+        cost = fluid.layers.cross_entropy(input=prediction, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        acc = fluid.layers.accuracy(input=prediction, label=label)
+        return prediction, avg_cost, acc
+    # data feeder
+    feature = fluid.layers.data(
+        name="feature", shape=[-1, 120 * 11], dtype="float32", lod_level=1)
+    label = fluid.layers.data(
+        name="label", shape=[-1, 1], dtype="int64", lod_level=1)
+    if parallel:
+        # When the execution place is specified to CUDAPlace, the program will
+        # run on all $CUDA_VISIBLE_DEVICES GPUs. Otherwise the program will
+        # run on all CPU devices.
+        places = fluid.layers.get_places()
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            feat_ = pd.read_input(feature)
+            label_ = pd.read_input(label)
+            prediction, avg_cost, acc = _net_conf(feat_, label_)
+            for out in [avg_cost, acc]:
+                pd.write_output(out)
+        # get mean loss and acc through every devices.
+        # NOTE(review): `prediction` is not written as a ParallelDo output;
+        # the value returned below is the variable created inside the block.
+        avg_cost, acc = pd()
+        avg_cost = fluid.layers.mean(x=avg_cost)
+        acc = fluid.layers.mean(x=acc)
+    else:
+        prediction, avg_cost, acc = _net_conf(feature, label)
+    return prediction, avg_cost, acc
--- a/fluid/DeepASR/tools/_init_paths.py
+++ b/fluid/DeepASR/tools/_init_paths.py
+"""Add the parent directory to $PYTHONPATH"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os.path
+import sys
+def add_path(path):
+    """Put `path` at the front of sys.path unless it is already listed."""
+    if path in sys.path:
+        return
+    sys.path.insert(0, path)
+# Directory that contains this file.
+this_dir = os.path.dirname(__file__)
+# Put the parent directory (the project root) at the front of sys.path, so
+# that importing this module makes the project packages importable.
+proj_path = os.path.join(this_dir, '..')
+add_path(proj_path)
--- a/fluid/DeepASR/tools/profile.py
+++ b/fluid/DeepASR/tools/profile.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import sys
+import numpy as np
+import argparse
+import time
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.profiler as profiler
+import _init_paths
+import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm
+import data_utils.augmentor.trans_add_delta as trans_add_delta
+import data_utils.augmentor.trans_splice as trans_splice
+import data_utils.data_reader as reader
+from model_utils.model import stacked_lstmp_model
+from data_utils.util import lodtensor_to_ndarray
+def parse_args():
+    """Parse the command line options of the profiling script.
+    Returns:
+        argparse.Namespace: The parsed arguments.
+    """
+    parser = argparse.ArgumentParser("Profiling for the stacked LSTMP model.")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=32,
+        help='The sequence number of a batch data. (default: %(default)d)')
+    parser.add_argument(
+        '--minimum_batch_size',
+        type=int,
+        default=1,
+        help='The minimum sequence number of a batch data. '
+        '(default: %(default)d)')
+    parser.add_argument(
+        '--stacked_num',
+        type=int,
+        default=5,
+        help='Number of lstmp layers to stack. (default: %(default)d)')
+    parser.add_argument(
+        '--proj_dim',
+        type=int,
+        default=512,
+        help='Project size of lstmp unit. (default: %(default)d)')
+    parser.add_argument(
+        '--hidden_dim',
+        type=int,
+        default=1024,
+        help='Hidden size of lstmp unit. (default: %(default)d)')
+    parser.add_argument(
+        '--learning_rate',
+        type=float,
+        default=0.002,
+        help='Learning rate used to train. (default: %(default)f)')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type. (default: %(default)s)')
+    parser.add_argument(
+        '--parallel', action='store_true', help='If set, run in parallel.')
+    parser.add_argument(
+        '--mean_var',
+        type=str,
+        default='data/global_mean_var_search26kHr',
+        help='mean var path')
+    parser.add_argument(
+        '--feature_lst',
+        type=str,
+        default='data/feature.lst',
+        help='feature list path.')
+    parser.add_argument(
+        '--label_lst',
+        type=str,
+        default='data/label.lst',
+        help='label list path.')
+    # The two options below bound the profiled window: batches before
+    # `first_batches_to_skip` are excluded, and the run stops after
+    # `max_batch_num` batches.
+    parser.add_argument(
+        '--max_batch_num',
+        type=int,
+        default=10,
+        help='Maximum number of batches for profiling. (default: %(default)d)')
+    parser.add_argument(
+        '--first_batches_to_skip',
+        type=int,
+        default=1,
+        help='Number of first batches to skip for profiling. '
+        '(default: %(default)d)')
+    parser.add_argument(
+        '--print_train_acc',
+        action='store_true',
+        help='If set, output training accuray.')
+    parser.add_argument(
+        '--sorted_key',
+        type=str,
+        default='total',
+        choices=['None', 'total', 'calls', 'min', 'max', 'ave'],
+        help='Different types of time to sort the profiling report. '
+        '(default: %(default)s)')
+    args = parser.parse_args()
+    return args
+def print_arguments(args):
+    """Print every parsed argument as `name: value`, sorted by name.
+    Args:
+        args(argparse.Namespace): The parsed command line arguments.
+    """
+    print('-----------  Configuration Arguments -----------')
+    # items() instead of the Python-2-only iteritems(): the output is the
+    # same and the function also works on Python 3.
+    for arg, value in sorted(vars(args).items()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+def profile(args):
+    """Profile the training process.
+    Runs up to `args.max_batch_num` training batches under the fluid
+    profiler; the profiler and the throughput counters are reset when batch
+    `args.first_batches_to_skip` is reached, so earlier batches are excluded.
+    Args:
+        args(argparse.Namespace): Options as produced by `parse_args`.
+    Raises:
+        ValueError: If `first_batches_to_skip` is negative or not smaller
+            than `max_batch_num`.
+    """
+    if not args.first_batches_to_skip < args.max_batch_num:
+        raise ValueError("arg 'first_batches_to_skip' must be smaller than "
+                         "'max_batch_num'.")
+    if not args.first_batches_to_skip >= 0:
+        raise ValueError(
+            "arg 'first_batches_to_skip' must not be smaller than 0.")
+    _, avg_cost, accuracy = stacked_lstmp_model(
+        hidden_dim=args.hidden_dim,
+        proj_dim=args.proj_dim,
+        stacked_num=args.stacked_num,
+        class_num=1749,
+        parallel=args.parallel)
+    adam_optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+    adam_optimizer.minimize(avg_cost)
+    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+    ltrans = [
+        trans_add_delta.TransAddDelta(2, 2),
+        trans_mean_variance_norm.TransMeanVarianceNorm(args.mean_var),
+        trans_splice.TransSplice()
+    ]
+    data_reader = reader.DataReader(args.feature_lst, args.label_lst)
+    data_reader.set_transformers(ltrans)
+    feature_t = fluid.LoDTensor()
+    label_t = fluid.LoDTensor()
+    # Compare by value: `is 'None'` tests object identity, which is not
+    # guaranteed to hold for a string parsed from the command line.
+    sorted_key = None if args.sorted_key == 'None' else args.sorted_key
+    with profiler.profiler(args.device, sorted_key) as prof:
+        # Start the clock now so the report stays meaningful even if the data
+        # ends before batch `first_batches_to_skip` is reached.
+        frames_seen, start_time = 0, time.time()
+        for batch_id, batch_data in enumerate(
+                data_reader.batch_iterator(args.batch_size,
+                                           args.minimum_batch_size)):
+            if batch_id >= args.max_batch_num:
+                break
+            if args.first_batches_to_skip == batch_id:
+                # Warm-up is over: restart profiling and the counters.
+                profiler.reset_profiler()
+                start_time = time.time()
+                frames_seen = 0
+            # load_data
+            (features, labels, lod) = batch_data
+            feature_t.set(features, place)
+            feature_t.set_lod([lod])
+            label_t.set(labels, place)
+            label_t.set_lod([lod])
+            # lod[-1] is the total number of frames in this batch.
+            frames_seen += lod[-1]
+            outs = exe.run(fluid.default_main_program(),
+                           feed={"feature": feature_t,
+                                 "label": label_t},
+                           fetch_list=[avg_cost, accuracy],
+                           return_numpy=False)
+            if args.print_train_acc:
+                print("Batch %d acc: %f" %
+                      (batch_id, lodtensor_to_ndarray(outs[1])[0]))
+            else:
+                sys.stdout.write('.')
+                sys.stdout.flush()
+        time_consumed = time.time() - start_time
+        frames_per_sec = frames_seen / time_consumed
+        print("\nTime consumed: %f s, performance: %f frames/s." %
+              (time_consumed, frames_per_sec))
+# Script entry point: parse the options, echo them, then run the profiling.
+if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
+    profile(args)
--- a/fluid/DeepASR/train.py
+++ b/fluid/DeepASR/train.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import sys
+import os
+import numpy as np
+import argparse
+import time
+import paddle.v2.fluid as fluid
+import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm
+import data_utils.augmentor.trans_add_delta as trans_add_delta
+import data_utils.augmentor.trans_splice as trans_splice
+import data_utils.data_reader as reader
+from data_utils.util import lodtensor_to_ndarray
+from model_utils.model import stacked_lstmp_model
+def parse_args():
+    """Parse the command line options of the training script.
+    Returns:
+        argparse.Namespace: The parsed arguments.
+    """
+    parser = argparse.ArgumentParser("Training for stacked LSTMP model.")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=32,
+        help='The sequence number of a batch data. (default: %(default)d)')
+    parser.add_argument(
+        '--minimum_batch_size',
+        type=int,
+        default=1,
+        help='The minimum sequence number of a batch data. '
+        '(default: %(default)d)')
+    parser.add_argument(
+        '--stacked_num',
+        type=int,
+        default=5,
+        help='Number of lstm layers to stack. (default: %(default)d)')
+    parser.add_argument(
+        '--proj_dim',
+        type=int,
+        default=512,
+        help='Project size of lstm unit. (default: %(default)d)')
+    parser.add_argument(
+        '--hidden_dim',
+        type=int,
+        default=1024,
+        help='Hidden size of lstm unit. (default: %(default)d)')
+    parser.add_argument(
+        '--pass_num',
+        type=int,
+        default=100,
+        help='Epoch number to train. (default: %(default)d)')
+    parser.add_argument(
+        '--print_per_batches',
+        type=int,
+        default=100,
+        help='Interval to print training accuracy. (default: %(default)d)')
+    parser.add_argument(
+        '--learning_rate',
+        type=float,
+        default=0.002,
+        help='Learning rate used to train. (default: %(default)f)')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type. (default: %(default)s)')
+    parser.add_argument(
+        '--parallel', action='store_true', help='If set, run in parallel.')
+    parser.add_argument(
+        '--mean_var',
+        type=str,
+        default='data/global_mean_var_search26kHr',
+        help='mean var path')
+    parser.add_argument(
+        '--train_feature_lst',
+        type=str,
+        default='data/feature.lst',
+        help='feature list path for training.')
+    parser.add_argument(
+        '--train_label_lst',
+        type=str,
+        default='data/label.lst',
+        help='label list path for training.')
+    parser.add_argument(
+        '--val_feature_lst',
+        type=str,
+        default='data/val_feature.lst',
+        help='feature list path for validation.')
+    parser.add_argument(
+        '--val_label_lst',
+        type=str,
+        default='data/val_label.lst',
+        help='label list path for validation.')
+    # Saving is skipped when this option is the empty string; the help text
+    # has to spell that value out as '' (it used to read "if set to .").
+    parser.add_argument(
+        '--model_save_dir',
+        type=str,
+        default='./checkpoints',
+        help="directory to save model. Do not save model if set to "
+        "''.")
+    args = parser.parse_args()
+    return args
+def print_arguments(args):
+    """Print every parsed argument as `name: value`, sorted by name.
+    Args:
+        args(argparse.Namespace): The parsed command line arguments.
+    """
+    print('-----------  Configuration Arguments -----------')
+    # items() instead of the Python-2-only iteritems(): the output is the
+    # same and the function also works on Python 3.
+    for arg, value in sorted(vars(args).items()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+def train(args):
+    """Train the stacked LSTMP model for `args.pass_num` passes.
+    After every pass the model is evaluated on the validation lists, the
+    inference model is saved (unless `args.model_save_dir` is empty) and a
+    summary line is printed.
+    Args:
+        args(argparse.Namespace): Options as produced by `parse_args`.
+    """
+    prediction, avg_cost, accuracy = stacked_lstmp_model(
+        hidden_dim=args.hidden_dim,
+        proj_dim=args.proj_dim,
+        stacked_num=args.stacked_num,
+        class_num=1749,
+        parallel=args.parallel)
+    adam_optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+    adam_optimizer.minimize(avg_cost)
+    # program for test
+    test_program = fluid.default_main_program().clone()
+    with fluid.program_guard(test_program):
+        test_program = fluid.io.get_inference_program([avg_cost, accuracy])
+    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+    # The same transformer list is given to the train and validation readers.
+    ltrans = [
+        trans_add_delta.TransAddDelta(2, 2),
+        trans_mean_variance_norm.TransMeanVarianceNorm(args.mean_var),
+        trans_splice.TransSplice()
+    ]
+    # These two tensors are reused for every batch, in training and in test.
+    feature_t = fluid.LoDTensor()
+    label_t = fluid.LoDTensor()
+    # validation
+    def test(exe):
+        # Returns (mean cost, mean accuracy) over all validation batches.
+        # If test data not found, return invalid cost and accuracy
+        if not (os.path.exists(args.val_feature_lst) and
+                os.path.exists(args.val_label_lst)):
+            return -1.0, -1.0
+        # test data reader
+        test_data_reader = reader.DataReader(args.val_feature_lst,
+                                             args.val_label_lst)
+        test_data_reader.set_transformers(ltrans)
+        test_costs, test_accs = [], []
+        for batch_id, batch_data in enumerate(
+                test_data_reader.batch_iterator(args.batch_size,
+                                                args.minimum_batch_size)):
+            # load_data
+            (features, labels, lod) = batch_data
+            feature_t.set(features, place)
+            feature_t.set_lod([lod])
+            label_t.set(labels, place)
+            label_t.set_lod([lod])
+            cost, acc = exe.run(test_program,
+                                feed={"feature": feature_t,
+                                      "label": label_t},
+                                fetch_list=[avg_cost, accuracy],
+                                return_numpy=False)
+            test_costs.append(lodtensor_to_ndarray(cost)[0])
+            test_accs.append(lodtensor_to_ndarray(acc)[0])
+        # NOTE(review): with zero validation batches both lists are empty and
+        # np.mean of an empty list yields nan.
+        return np.mean(test_costs), np.mean(test_accs)
+    # train data reader
+    train_data_reader = reader.DataReader(args.train_feature_lst,
+                                          args.train_label_lst)
+    train_data_reader.set_transformers(ltrans)
+    # train
+    for pass_id in xrange(args.pass_num):
+        pass_start_time = time.time()
+        for batch_id, batch_data in enumerate(
+                train_data_reader.batch_iterator(args.batch_size,
+                                                 args.minimum_batch_size)):
+            # load_data
+            (features, labels, lod) = batch_data
+            feature_t.set(features, place)
+            feature_t.set_lod([lod])
+            label_t.set(labels, place)
+            label_t.set_lod([lod])
+            cost, acc = exe.run(fluid.default_main_program(),
+                                feed={"feature": feature_t,
+                                      "label": label_t},
+                                fetch_list=[avg_cost, accuracy],
+                                return_numpy=False)
+            # Print the metrics every `print_per_batches` batches (never for
+            # batch 0); otherwise print a progress dot.
+            if batch_id > 0 and (batch_id % args.print_per_batches == 0):
+                print("\nBatch %d, train cost: %f, train acc: %f" %
+                      (batch_id, lodtensor_to_ndarray(cost)[0],
+                       lodtensor_to_ndarray(acc)[0]))
+            else:
+                sys.stdout.write('.')
+                sys.stdout.flush()
+        # run test
+        val_cost, val_acc = test(exe)
+        # save model; an empty model_save_dir disables saving
+        if args.model_save_dir != '':
+            model_path = os.path.join(
+                args.model_save_dir, "deep_asr.pass_" + str(pass_id) + ".model")
+            fluid.io.save_inference_model(model_path, ["feature"],
+                                          [prediction], exe)
+        # cal pass time (includes the validation and saving above)
+        pass_end_time = time.time()
+        time_consumed = pass_end_time - pass_start_time
+        # print info at pass end
+        print("\nPass %d, time consumed: %f s, val cost: %f, val acc: %f\n" %
+              (pass_id, time_consumed, val_cost, val_acc))
+# Script entry point: parse the options, echo them, make sure the model
+# directory exists (unless saving is disabled with ''), then train.
+if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
+    if args.model_save_dir != '' and not os.path.exists(args.model_save_dir):
+        # makedirs, unlike mkdir, also creates missing parent directories,
+        # so a nested --model_save_dir works.
+        os.makedirs(args.model_save_dir)
+    train(args)
--- a/fluid/adversarial/advbox/__init__.py
+++ b/fluid/adversarial/advbox/__init__.py
 """
-   A set of tools for generating adversarial example on paddle platform 
+   A set of tools for generating adversarial example on paddle platform
 """
+from . import attacks
+from . import models
+from .adversary import Adversary
--- a/fluid/adversarial/advbox/adversary.py
+++ b/fluid/adversarial/advbox/adversary.py
+"""
+Defines a class that contains the original object, the target and the
+adversarial example.
+"""
+class Adversary(object):
+    """
+    Adversary contains the original object, the target and the adversarial
+    example.
+    """
+    def __init__(self, original, original_label=None):
+        """
+        :param original: The original instance, such as an image.
+        :param original_label: The original instance's label.
+        """
+        assert original is not None
+        self.__original = original
+        self.__original_label = original_label
+        self.__target_label = None
+        self.__target = None
+        self.__is_targeted_attack = False
+        self.__adversarial_example = None
+        self.__adversarial_label = None
+    def set_target(self, is_targeted_attack, target=None, target_label=None):
+        """
+        Set the target be targeted or untargeted.
+        :param is_targeted_attack: bool
+        :param target: The target.
+        :param target_label: If is_targeted_attack is true and target_label is
+                    None, self.target_label will be set by the Attack class.
+                    If is_targeted_attack is false, target_label must be None.
+        """
+        assert (target_label is None) or is_targeted_attack
+        self.__is_targeted_attack = is_targeted_attack
+        self.__target_label = target_label
+        self.__target = target
+        # An untargeted attack keeps neither a target nor a target label.
+        if not is_targeted_attack:
+            self.__target_label = None
+            self.__target = None
+    def set_original(self, original, original_label=None):
+        """
+        Reset the original.
+        When the original actually changes, the stored adversarial example
+        and adversarial label are discarded, because they were found for the
+        previous original. When it is unchanged, nothing is modified and
+        original_label is ignored.
+        :param original: Original instance.
+        :param original_label: Original instance's label.
+        """
+        # Imported locally: this module has no top-level numpy import.
+        import numpy as np
+        # `!=` on ndarrays is element-wise and raises inside an `if`, so
+        # compare identity first and the contents second.
+        unchanged = (original is self.__original) or (
+            original is not None and
+            np.array_equal(original, self.__original))
+        if not unchanged:
+            self.__original = original
+            self.__original_label = original_label
+            self.__adversarial_example = None
+            # Without this, is_successful() would keep reporting the result
+            # that belonged to the previous original.
+            self.__adversarial_label = None
+        if original is None:
+            self.__original_label = None
+    def _is_successful(self, adversarial_label):
+        """
+        Is the adversarial_label is the expected adversarial label.
+        With a target label set, the label must equal it; otherwise any
+        label other than None and the original label counts as a success.
+        :param adversarial_label: adversarial label.
+        :return: bool
+        """
+        if self.__target_label is not None:
+            return adversarial_label == self.__target_label
+        else:
+            return (adversarial_label is not None) and \
+                   (adversarial_label != self.__original_label)
+    def is_successful(self):
+        """
+        Has the adversarial example been found.
+        :return: bool
+        """
+        return self._is_successful(self.__adversarial_label)
+    def try_accept_the_example(self, adversarial_example, adversarial_label):
+        """
+        If adversarial_label the target label that we are finding.
+        The adversarial_example and adversarial_label will be accepted and
+        True will be returned.
+        :return: bool
+        """
+        assert adversarial_example is not None
+        assert self.__original.shape == adversarial_example.shape
+        ok = self._is_successful(adversarial_label)
+        if ok:
+            self.__adversarial_example = adversarial_example
+            self.__adversarial_label = adversarial_label
+        return ok
+    def perturbation(self, multiplying_factor=1.0):
+        """
+        The perturbation that the adversarial_example is added.
+        :param multiplying_factor: float.
+        :return: The perturbation that is multiplied by multiplying_factor.
+        """
+        assert self.__original is not None
+        assert self.__adversarial_example is not None
+        return multiplying_factor * (
+            self.__adversarial_example - self.__original)
+    @property
+    def is_targeted_attack(self):
+        """
+        :property: is_targeted_attack
+        """
+        return self.__is_targeted_attack
+    @property
+    def target_label(self):
+        """
+        :property: target_label
+        """
+        return self.__target_label
+    @target_label.setter
+    def target_label(self, label):
+        """
+        :property: target_label
+        """
+        self.__target_label = label
+    @property
+    def target(self):
+        """
+        :property: target
+        """
+        return self.__target
+    @property
+    def original(self):
+        """
+        :property: original
+        """
+        return self.__original
+    @property
+    def original_label(self):
+        """
+        :property: original_label
+        """
+        return self.__original_label
+    @original_label.setter
+    def original_label(self, label):
+        """
+        original_label setter
+        """
+        self.__original_label = label
+    @property
+    def adversarial_example(self):
+        """
+        :property: adversarial_example
+        """
+        return self.__adversarial_example
+    @adversarial_example.setter
+    def adversarial_example(self, example):
+        """
+        adversarial_example setter
+        """
+        self.__adversarial_example = example
+    @property
+    def adversarial_label(self):
+        """
+        :property: adversarial_label
+        """
+        return self.__adversarial_label
+    @adversarial_label.setter
+    def adversarial_label(self, label):
+        """
+        adversarial_label setter
+        """
+        self.__adversarial_label = label
--- a/fluid/adversarial/advbox/attacks/__init__.py
+++ b/fluid/adversarial/advbox/attacks/__init__.py
+"""
+Attack methods
+"""
+from .base import Attack
+from .deepfool import DeepFoolAttack
+from .gradientsign import FGSM
+from .gradientsign import GradientSignAttack
+from .iterator_gradientsign import IFGSM
+from .iterator_gradientsign import IteratorGradientSignAttack
--- a/fluid/adversarial/advbox/attacks/base.py
+++ b/fluid/adversarial/advbox/attacks/base.py
 """
 The base model of the model.
 """
-from abc import ABCMeta, abstractmethod
+import logging
+from abc import ABCMeta
+from abc import abstractmethod
+import numpy as np
 class Attack(object):
    """
-    Abstract base class for adversarial attacks. `Attack` represent an adversarial attack
+    Abstract base class for adversarial attacks. `Attack` represent an
-    which search an adversarial example. subclass should implement the _apply() method.
+    adversarial attack which search an adversarial example. subclass should
+    implement the _apply() method.
    Args:
        model(Model): an instance of the class advbox.base.Model.
@@ -18,22 +23,50 @@ class Attack(object):
    def __init__(self, model):
        self.model = model
-    def __call__(self, image_label):
+    def __call__(self, adversary, **kwargs):
        """
        Generate the adversarial sample.
        Args:
-        image_label(list): The image and label tuple list with one element.
+        adversary(object): The adversary object.
+        **kwargs: Other named arguments.
        """
-        adv_img = self._apply(image_label)
+        self._preprocess(adversary)
-        return adv_img
+        return self._apply(adversary, **kwargs)
    @abstractmethod
-    def _apply(self, image_label):
+    def _apply(self, adversary, **kwargs):
        """
        Search an adversarial example.
        Args:
-        image_batch(list): The image and label tuple list with one element.
+        adversary(object): The adversary object.
+        **kwargs: Other named arguments.
        """
        raise NotImplementedError
+    def _preprocess(self, adversary):
+        """
+        Preprocess the adversary object.
+        Fills in the labels the caller left unset: the original label is
+        predicted from adversary.original, and for a targeted attack without
+        a target label, the label is predicted from adversary.target.
+        :param adversary: adversary
+        :return: None
+        :raises ValueError: If the attack is targeted and neither
+                    target_label nor target is set.
+        """
+        if adversary.original_label is None:
+            adversary.original_label = np.argmax(
+                self.model.predict(adversary.original))
+        if adversary.is_targeted_attack and adversary.target_label is None:
+            if adversary.target is None:
+                raise ValueError(
+                    'When adversary.is_targeted_attack is True, '
+                    'adversary.target_label or adversary.target must be set.')
+            else:
+                # Must go through the `target_label` property; the original
+                # assigned to a misspelled attribute and left it None.
+                adversary.target_label = np.argmax(
+                    self.model.predict(
+                        self.model.scale_input(adversary.target)))
+        logging.info('adversary:\noriginal_label: {}'
+                     '\n          target_lable: {}'
+                     '\n          is_targeted_attack: {}'
+                     ''.format(adversary.original_label, adversary.target_label,
+                               adversary.is_targeted_attack))
--- a/fluid/adversarial/advbox/attacks/deepfool.py
+++ b/fluid/adversarial/advbox/attacks/deepfool.py
+"""
+This module provide the attack method for deepfool. Deepfool is a simple and
+accurate adversarial attack.
+"""
+from __future__ import division
+import logging
+import numpy as np
+from .base import Attack
+class DeepFoolAttack(Attack):
+    """
+    DeepFool: a simple and accurate method to fool deep neural networks,
+    Seyed-Mohsen Moosavi-Dezfooli, Alhussein Fawzi, Pascal Frossard,
+    https://arxiv.org/abs/1511.04599
+    """
+    def _apply(self, adversary, iterations=100, overshoot=0.02):
+        """
+        Apply the deep fool attack.
+        Args:
+            adversary(Adversary): The Adversary object.
+            iterations(int): Maximum number of update steps.
+            overshoot(float): Every step r_i is scaled by (1 + overshoot)
+                before it is added to the current example.
+        Return:
+            adversary: The Adversary object. It holds an adversarial example
+                only if one was accepted within `iterations` steps.
+        """
+        assert adversary is not None
+        pre_label = adversary.original_label
+        min_, max_ = self.model.bounds()
+        f = self.model.predict(adversary.original)
+        # Candidate labels: only the target for a targeted attack. Otherwise
+        # all classes, or, with more than max_class_count classes, the
+        # max_class_count best-scored ones after excluding the top-scored one.
+        if adversary.is_targeted_attack:
+            labels = [adversary.target_label]
+        else:
+            max_class_count = 10
+            class_count = self.model.num_classes()
+            if class_count > max_class_count:
+                labels = np.argsort(f)[-(max_class_count + 1):-1]
+            else:
+                labels = np.arange(class_count)
+        gradient = self.model.gradient(adversary.original, pre_label)
+        x = adversary.original
+        for iteration in xrange(iterations):
+            # Pick the candidate label k with the smallest ratio
+            # |f_k - f_pre| / ||grad_k - grad_pre||.
+            # NOTE(review): if every candidate equals pre_label, w, w_norm
+            # and pert stay at inf and r_i below evaluates to NaN.
+            w = np.inf
+            w_norm = np.inf
+            pert = np.inf
+            for k in labels:
+                if k == pre_label:
+                    continue
+                gradient_k = self.model.gradient(x, k)
+                w_k = gradient_k - gradient
+                f_k = f[k] - f[pre_label]
+                # The 1e-8 terms keep the ratio finite and non-zero.
+                w_k_norm = np.linalg.norm(w_k) + 1e-8
+                pert_k = (np.abs(f_k) + 1e-8) / w_k_norm
+                if pert_k < pert:
+                    pert = pert_k
+                    w = w_k
+                    w_norm = w_k_norm
+            r_i = -w * pert / w_norm  # The gradient is -gradient in the paper.
+            x = x + (1 + overshoot) * r_i
+            # Keep the example inside the model's input bounds.
+            x = np.clip(x, min_, max_)
+            # Refresh the scores and the gradient for the next step.
+            f = self.model.predict(x)
+            gradient = self.model.gradient(x, pre_label)
+            adv_label = np.argmax(f)
+            logging.info('iteration = {}, f = {}, pre_label = {}'
+                         ', adv_label={}'.format(iteration, f[pre_label],
+                                                 pre_label, adv_label))
+            # Stop as soon as the adversary accepts the current example.
+            if adversary.try_accept_the_example(x, adv_label):
+                return adversary
+        return adversary
--- a/fluid/adversarial/advbox/attacks/gradientsign.py
+++ b/fluid/adversarial/advbox/attacks/gradientsign.py
@@ -2,37 +2,59 @@
 This module provide the attack method for FGSM's implement.
 """
 from __future__ import division
-import numpy as np
+import logging
 from collections import Iterable
+import numpy as np
 from .base import Attack
 class GradientSignAttack(Attack):
    """
    This attack was originally implemented by Goodfellow et al. (2015) with the
-    infinity norm (and is known as the "Fast Gradient Sign Method"). This is therefore called
+    infinity norm (and is known as the "Fast Gradient Sign Method").
-    the Fast Gradient Method.
+    This is therefore called the Fast Gradient Method.
    Paper link: https://arxiv.org/abs/1412.6572
    """
-    def _apply(self, image_label, epsilons=1000):
+    def _apply(self, adversary, epsilons=1000):
-        assert len(image_label) == 1
+        """
-        pre_label = np.argmax(self.model.predict(image_label))
+          Apply the gradient sign attack.
+          Args:
+              adversary(Adversary): The Adversary object.
+              epsilons(list|tuple|int): The epsilon (input variation parameter).
+          Return:
+              adversary: The Adversary object.
+          """
+        assert adversary is not None
+        if not isinstance(epsilons, Iterable):
+            epsilons = np.linspace(0, 1, num=epsilons + 1)[1:]
+        pre_label = adversary.original_label
        min_, max_ = self.model.bounds()
-        gradient = self.model.gradient(image_label)
-        gradient_sign = np.sign(gradient) * (max_ - min_)
-        if not isinstance(epsilons, Iterable):
+        if adversary.is_targeted_attack:
-            epsilons = np.linspace(0, 1, num=epsilons + 1)
+            gradient = self.model.gradient(adversary.original,
+                                           adversary.target_label)
+            gradient_sign = -np.sign(gradient) * (max_ - min_)
+        else:
+            gradient = self.model.gradient(adversary.original,
+                                           adversary.original_label)
+            gradient_sign = np.sign(gradient) * (max_ - min_)
        for epsilon in epsilons:
-            adv_img = image_label[0][0].reshape(
+            adv_img = adversary.original + epsilon * gradient_sign
-                gradient_sign.shape) + epsilon * gradient_sign
            adv_img = np.clip(adv_img, min_, max_)
-            adv_label = np.argmax(self.model.predict([(adv_img, 0)]))
+            adv_label = np.argmax(self.model.predict(adv_img))
-            if pre_label != adv_label:
+            logging.info('epsilon = {:.3f}, pre_label = {}, adv_label={}'.
-                return adv_img
+                         format(epsilon, pre_label, adv_label))
+            if adversary.try_accept_the_example(adv_img, adv_label):
+                return adversary
+        return adversary
 FGSM = GradientSignAttack
--- a/fluid/adversarial/advbox/attacks/iterator_gradientsign.py
+++ b/fluid/adversarial/advbox/attacks/iterator_gradientsign.py
@@ -2,8 +2,12 @@
 This module provide the attack method for Iterator FGSM's implement.
 """
 from __future__ import division
-import numpy as np
+import logging
 from collections import Iterable
+import numpy as np
 from .base import Attack
@@ -13,31 +17,43 @@ class IteratorGradientSignAttack(Attack):
    Paper link: https://arxiv.org/pdf/1607.02533.pdf
    """
-    def _apply(self, image_label, epsilons=100, steps=10):
+    def _apply(self, adversary, epsilons=100, steps=10):
        """
        Apply the iterative gradient sign attack.
        Args:
-            image_label(list): The image and label tuple list of one element.
+            adversary(Adversary): The Adversary object.
            epsilons(list|tuple|int): The epsilon (input variation parameter).
            steps(int): The number of iterator steps.
        Return:
-            numpy.ndarray: The adversarail sample generated by the algorithm.
+            adversary(Adversary): The Adversary object.
        """
-        assert len(image_label) == 1
-        pre_label = np.argmax(self.model.predict(image_label))
-        gradient = self.model.gradient(image_label)
-        min_, max_ = self.model.bounds()
        if not isinstance(epsilons, Iterable):
-            epsilons = np.linspace(0, 1, num=epsilons + 1)
+            epsilons = np.linspace(0, 1 / steps, num=epsilons + 1)[1:]
+        pre_label = adversary.original_label
+        min_, max_ = self.model.bounds()
        for epsilon in epsilons:
-            adv_img = image_label[0][0].reshape(gradient.shape)
+            adv_img = adversary.original
            for _ in range(steps):
-                gradient = self.model.gradient([(adv_img, image_label[0][1])])
+                # Differentiate at the current iterate (adv_img). Evaluating
+                # the gradient at adversary.original would repeat one fixed
+                # direction on every step instead of following the loss.
+                if adversary.is_targeted_attack:
-                gradient_sign = np.sign(gradient) * (max_ - min_)
+                    gradient = self.model.gradient(adv_img,
-                adv_img = adv_img + epsilon * gradient_sign
+                                                   adversary.target_label)
+                    gradient_sign = -np.sign(gradient) * (max_ - min_)
+                else:
+                    gradient = self.model.gradient(adv_img,
+                                                   adversary.original_label)
+                    gradient_sign = np.sign(gradient) * (max_ - min_)
+                adv_img = adv_img + gradient_sign * epsilon
                adv_img = np.clip(adv_img, min_, max_)
-                adv_label = np.argmax(self.model.predict([(adv_img, 0)]))
+                adv_label = np.argmax(self.model.predict(adv_img))
-                if pre_label != adv_label:
+                logging.info('epsilon = {:.3f}, pre_label = {}, adv_label={}'.
-                    return adv_img
+                             format(epsilon, pre_label, adv_label))
+                if adversary.try_accept_the_example(adv_img, adv_label):
+                    return adversary
+        return adversary
+IFGSM = IteratorGradientSignAttack
--- a/fluid/adversarial/advbox/models/__init__.py
+++ b/fluid/adversarial/advbox/models/__init__.py
 """
-Paddle model for target of attack 
+Paddle model for target of attack
 """
+from .base import Model
+from .paddle import PaddleModel
--- a/fluid/adversarial/advbox/models/base.py
+++ b/fluid/adversarial/advbox/models/base.py
@@ -2,21 +2,21 @@
 The base model of the model.
 """
 from abc import ABCMeta
-import abc
+from abc import abstractmethod
-abstractmethod = abc.abstractmethod
+import numpy as np
 class Model(object):
    """
    Base class of model to provide attack.
    Args:
        bounds(tuple): The lower and upper bound for the image pixel.
-        channel_axis(int): The index of the axis that represents the color channel.
+        channel_axis(int): The index of the axis that represents the color
-        preprocess(tuple): Two element tuple used to preprocess the input. First
+                channel.
-            substract the first element, then divide the second element.
+        preprocess(tuple): Two element tuple used to preprocess the input.
+            First subtract the first element, then divide by the second element.
    """
    __metaclass__ = ABCMeta
@@ -43,25 +43,32 @@ class Model(object):
        return self._channel_axis
    def _process_input(self, input_):
+        """
+        Apply the (sub, div) preprocess pair to the input.
+        Args:
+            input_(numpy.ndarray): The raw input data.
+        Return:
+            numpy.ndarray: (input_ - sub) / div; input_ itself is returned
+                when sub is all zeros and div is all ones.
+        """
-        res = input_
+        res = None
        sub, div = self._preprocess
-        if sub != 0:
+        if np.any(sub != 0):
            res = input_ - sub
-        assert div != 0
+        # Every divisor element must be non-zero; np.any would let an array
+        # that still contains zeros through.
+        assert np.all(div != 0)
-        if div != 1:
+        if np.any(div != 1):
-            res /= div
+            if res is None:  # "res = input_ - sub" is not executed!
+                res = input_ / div
+            else:
+                res /= div
+        if res is None:  # "res = (input_ - sub)/ div" is not executed!
+            return input_
        return res
    @abstractmethod
-    def predict(self, image_batch):
+    def predict(self, data):
        """
-        Calculate the prediction of the image batch.
+        Calculate the prediction of the data.
        Args:
-            image_batch(numpy.ndarray): image batch of shape (batch_size, height, width, channels).
+            data(numpy.ndarray): input data with shape (size,
+            height, width, channels).
        Return:
-            numpy.ndarray: predictions of the images with shape (batch_size, num_of_classes).
+            numpy.ndarray: predictions of the data with shape (batch_size,
+                num_of_classes).
        """
        raise NotImplementedError
@@ -76,15 +83,17 @@ class Model(object):
        raise NotImplementedError
    @abstractmethod
-    def gradient(self, image_batch):
+    def gradient(self, data, label):
        """
        Calculate the gradient of the cross-entropy loss w.r.t the image.
        Args:
-            image_batch(list): The image and label tuple list.
+            data(numpy.ndarray): input data with shape (size, height, width,
+            channels).
+            label(int): Label used to calculate the gradient.
        Return:
-            numpy.ndarray: gradient of the cross-entropy loss w.r.t the image with
+            numpy.ndarray: gradient of the cross-entropy loss w.r.t the image
-                the shape (height, width, channel).
+                with the shape (height, width, channel).
        """
        raise NotImplementedError
--- a/fluid/adversarial/advbox/models/paddle.py
+++ b/fluid/adversarial/advbox/models/paddle.py
+"""
+Paddle model
+"""
 from __future__ import absolute_import
 import numpy as np
-import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
-from paddle.v2.fluid.framework import program_guard
 from .base import Model
@@ -11,10 +12,12 @@ from .base import Model
 class PaddleModel(Model):
    """
    Create a PaddleModel instance.
-    When you need to generate a adversarial sample, you should construct an instance of PaddleModel.
+    When you need to generate an adversarial sample, you should construct an
+    instance of PaddleModel.
    Args:
-        program(paddle.v2.fluid.framework.Program): The program of the model which generate the adversarial sample.
+        program(paddle.v2.fluid.framework.Program): The program of the model
+            which generate the adversarial sample.
        input_name(string): The name of the input.
        logits_name(string): The name of the logits.
        predict_name(string): The name of the predict.
@@ -30,12 +33,12 @@ class PaddleModel(Model):
                 bounds,
                 channel_axis=3,
                 preprocess=None):
-        super(PaddleModel, self).__init__(
-            bounds=bounds, channel_axis=channel_axis, preprocess=preprocess)
        if preprocess is None:
            preprocess = (0, 1)
+        super(PaddleModel, self).__init__(
+            bounds=bounds, channel_axis=channel_axis, preprocess=preprocess)
        self._program = program
        self._place = fluid.CPUPlace()
        self._exe = fluid.Executor(self._place)
@@ -49,30 +52,36 @@ class PaddleModel(Model):
        loss = self._program.block(0).var(self._cost_name)
        param_grads = fluid.backward.append_backward(
            loss, parameter_list=[self._input_name])
-        self._gradient = dict(param_grads)[self._input_name]
+        self._gradient = filter(lambda p: p[0].name == self._input_name,
+                                param_grads)[0][1]
-    def predict(self, image_batch):
+    def predict(self, data):
        """
-            Predict the label of the image_batch.
+        Calculate the prediction of the data.
-            Args:
+        Args:
-                image_batch(list): The image and label tuple list.
+            data(numpy.ndarray): input data with shape (size,
-            Return:
+            height, width, channels).
-                numpy.ndarray: predictions of the images with shape (batch_size, num_of_classes).
+        Return:
+            numpy.ndarray: predictions of the data with shape (batch_size,
+                num_of_classes).
        """
+        scaled_data = self._process_input(data)
        feeder = fluid.DataFeeder(
            feed_list=[self._input_name, self._logits_name],
            place=self._place,
            program=self._program)
        predict_var = self._program.block(0).var(self._predict_name)
        predict = self._exe.run(self._program,
-                                feed=feeder.feed(image_batch),
+                                feed=feeder.feed([(scaled_data, 0)]),
                                fetch_list=[predict_var])
+        predict = np.squeeze(predict, axis=0)
        return predict
    def num_classes(self):
        """
-            Calculate the number of classes of the output label. 
+            Calculate the number of classes of the output label.
        Return:
            int: the number of classes
@@ -81,21 +90,27 @@ class PaddleModel(Model):
        assert len(predict_var.shape) == 2
        return predict_var.shape[1]
-    def gradient(self, image_batch):
+    def gradient(self, data, label):
        """
-        Calculate the gradient of the loss w.r.t the input.
+        Calculate the gradient of the cross-entropy loss w.r.t the image.
        Args:
-            image_batch(list): The image and label tuple list.
+            data(numpy.ndarray): input data with shape (size, height, width,
+            channels).
+            label(int): Label used to calculate the gradient.
        Return:
-            list: The list of the gradient of the image.
+            numpy.ndarray: gradient of the cross-entropy loss w.r.t the image
+                with the shape (height, width, channel).
        """
+        scaled_data = self._process_input(data)
        feeder = fluid.DataFeeder(
            feed_list=[self._input_name, self._logits_name],
            place=self._place,
            program=self._program)
        grad, = self._exe.run(self._program,
-                              feed=feeder.feed(image_batch),
+                              feed=feeder.feed([(scaled_data, label)]),
                              fetch_list=[self._gradient])
-        return grad
+        return grad.reshape(data.shape)
--- a/fluid/adversarial/mnist_tutorial_fgsm.py
+++ b/fluid/adversarial/mnist_tutorial_fgsm.py
 """
 FGSM demos on mnist using advbox tool.
 """
+import matplotlib.pyplot as plt
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
-import matplotlib.pyplot as plt
-import numpy as np
-from advbox.models.paddle import PaddleModel
+from advbox import Adversary
 from advbox.attacks.gradientsign import GradientSignAttack
+from advbox.models.paddle import PaddleModel
 def cnn_model(img):
@@ -18,7 +18,7 @@ def cnn_model(img):
    Returns:
        Variable: the label prediction
    """
-    #conv1 = fluid.nets.conv2d()
+    # conv1 = fluid.nets.conv2d()
    conv_pool_1 = fluid.nets.simple_img_conv_pool(
        input=img,
        num_filters=20,
@@ -76,10 +76,11 @@ def main():
    att = GradientSignAttack(m)
    for data in train_reader():
        # fgsm attack
-        adv_img = att(data)
+        adversary = att(Adversary(data[0][0], data[0][1]))
-        plt.imshow(n[0][0], cmap='Greys_r')
+        if adversary.is_successful():
-        plt.show()
+            plt.imshow(adversary.target, cmap='Greys_r')
-        #np.save('adv_img', adv_img)
+            plt.show()
+            # np.save('adv_img', adversary.target)
        break

--- a/fluid/image_classification/README.md
+++ b/fluid/image_classification/README.md
+# SE-ResNeXt for image classification
+This model built with paddle fluid is still under active development and is not
+the final version. We welcome feedback.
--- a/fluid/image_classification/reader.py
+++ b/fluid/image_classification/reader.py
 import os
+import math
 import random
 import functools
 import numpy as np
@@ -7,10 +8,6 @@ from PIL import Image, ImageEnhance
 random.seed(0)
-_R_MEAN = 123.0
-_G_MEAN = 117.0
-_B_MEAN = 104.0
 DATA_DIM = 224
 THREAD = 8
@@ -20,7 +17,8 @@ DATA_DIR = 'ILSVRC2012'
 TRAIN_LIST = 'ILSVRC2012/train_list.txt'
 TEST_LIST = 'ILSVRC2012/test_list.txt'
-img_mean = np.array([_R_MEAN, _G_MEAN, _B_MEAN]).reshape((3, 1, 1))
+img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
+img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
 def resize_short(img, target_size):
@@ -46,6 +44,36 @@ def crop_image(img, target_size, center):
    return img
+def random_crop(img, size, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.)):
+    """
+    Crop a random region of the image and resize it to a square.
+    Args:
+        img: Image exposing `size`, `crop` and `resize` (a PIL image here).
+        size(int): Edge length of the returned square image.
+        scale(sequence): Lower and upper bound of the cropped area, as a
+            fraction of the image area.
+        ratio(sequence): Lower and upper bound of the sampled aspect ratio.
+    Return:
+        The cropped image resized to (size, size) with LANCZOS filtering.
+    """
+    aspect_ratio = math.sqrt(random.uniform(*ratio))
+    w = 1. * aspect_ratio
+    h = 1. / aspect_ratio
+    # Cap the area fraction so the scaled crop never exceeds either image
+    # dimension.
+    bound = min((float(img.size[0]) / img.size[1]) / (w**2),
+                (float(img.size[1]) / img.size[0]) / (h**2))
+    scale_max = min(scale[1], bound)
+    scale_min = min(scale[0], bound)
+    target_area = img.size[0] * img.size[1] * random.uniform(scale_min,
+                                                             scale_max)
+    target_size = math.sqrt(target_area)
+    w = int(target_size * w)
+    h = int(target_size * h)
+    # Pick the top-left corner so the crop box stays inside the image.
+    i = random.randint(0, img.size[0] - w)
+    j = random.randint(0, img.size[1] - h)
+    img = img.crop((i, j, i + w, j + h))
+    img = img.resize((size, size), Image.LANCZOS)
+    return img
+def rotate_image(img):
+    """Rotate the image by a random whole-degree angle drawn from [-10, 10]."""
+    return img.rotate(random.randint(-10, 10))
 def distort_color(img):
    def random_brightness(img, lower=0.5, upper=1.5):
        e = random.uniform(lower, upper)
@@ -69,25 +97,28 @@ def distort_color(img):
    return img
-def process_image(sample, mode):
+def process_image(sample, mode, color_jitter, rotate):
    img_path = sample[0]
    img = Image.open(img_path)
    if mode == 'train':
-        img = resize_short(img, DATA_DIM + 32)
+        if rotate: img = rotate_image(img)
+        img = random_crop(img, DATA_DIM)
    else:
        img = resize_short(img, DATA_DIM)
-    img = crop_image(img, target_size=DATA_DIM, center=(mode != 'train'))
+        img = crop_image(img, target_size=DATA_DIM, center=True)
    if mode == 'train':
-        img = distort_color(img)
+        if color_jitter:
+            img = distort_color(img)
        if random.randint(0, 1) == 1:
            img = img.transpose(Image.FLIP_LEFT_RIGHT)
    if img.mode != 'RGB':
        img = img.convert('RGB')
-    img = np.array(img).astype('float32').transpose((2, 0, 1))
+    img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255
    img -= img_mean
+    img /= img_std
    if mode == 'train' or mode == 'test':
        return img, sample[1]
@@ -95,7 +126,11 @@ def process_image(sample, mode):
        return img
-def _reader_creator(file_list, mode, shuffle=False):
+def _reader_creator(file_list,
+                    mode,
+                    shuffle=False,
+                    color_jitter=False,
+                    rotate=False):
    def reader():
        with open(file_list) as flist:
            lines = [line.strip() for line in flist]
@@ -110,13 +145,15 @@ def _reader_creator(file_list, mode, shuffle=False):
                    img_path = os.path.join(DATA_DIR, line)
                    yield [img_path]
-    mapper = functools.partial(process_image, mode=mode)
+    mapper = functools.partial(
+        process_image, mode=mode, color_jitter=color_jitter, rotate=rotate)
    return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE)
 def train():
-    return _reader_creator(TRAIN_LIST, 'train', shuffle=True)
+    return _reader_creator(
+        TRAIN_LIST, 'train', shuffle=True, color_jitter=True, rotate=True)
 def test():

--- a/fluid/image_classification/se_resnext.py
+++ b/fluid/image_classification/se_resnext.py
 import os
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 import reader
@@ -35,7 +34,11 @@ def squeeze_excitation(input, num_channels, reduction_ratio):
 def shortcut(input, ch_out, stride):
    ch_in = input.shape[1]
    if ch_in != ch_out:
-        return conv_bn_layer(input, ch_out, 3, stride)
+        if stride == 1:
+            filter_size = 1
+        else:
+            filter_size = 3
+        return conv_bn_layer(input, ch_out, filter_size, stride)
    else:
        return input
@@ -75,7 +78,7 @@ def SE_ResNeXt(input, class_dim, infer=False):
    conv = conv_bn_layer(
        input=conv, num_filters=128, filter_size=3, stride=1, act='relu')
    conv = fluid.layers.pool2d(
-        input=conv, pool_size=3, pool_stride=2, pool_type='max')
+        input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
    for block in range(len(depth)):
        for i in range(depth[block]):
@@ -96,7 +99,11 @@ def SE_ResNeXt(input, class_dim, infer=False):
    return out
-def train(learning_rate, batch_size, num_passes, model_save_dir='model'):
+def train(learning_rate,
+          batch_size,
+          num_passes,
+          init_model=None,
+          model_save_dir='model'):
    class_dim = 1000
    image_shape = [3, 224, 224]
@@ -109,9 +116,9 @@ def train(learning_rate, batch_size, num_passes, model_save_dir='model'):
    avg_cost = fluid.layers.mean(x=cost)
    optimizer = fluid.optimizer.Momentum(
-        learning_rate=learning_rate / batch_size,
+        learning_rate=learning_rate,
        momentum=0.9,
-        regularization=fluid.regularizer.L2Decay(1e-4 * batch_size))
+        regularization=fluid.regularizer.L2Decay(1e-4))
    opts = optimizer.minimize(avg_cost)
    accuracy = fluid.evaluator.Accuracy(input=out, label=label)
@@ -125,6 +132,9 @@ def train(learning_rate, batch_size, num_passes, model_save_dir='model'):
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
+    if init_model is not None:
+        fluid.io.load_persistables_if_exist(exe, init_model)
    train_reader = paddle.batch(reader.train(), batch_size=batch_size)
    test_reader = paddle.batch(reader.test(), batch_size=batch_size)
    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
@@ -141,16 +151,18 @@ def train(learning_rate, batch_size, num_passes, model_save_dir='model'):
        test_accuracy.reset(exe)
        for data in test_reader():
-            out, acc = exe.run(inference_program,
+            loss, acc = exe.run(inference_program,
-                               feed=feeder.feed(data),
+                                feed=feeder.feed(data),
-                               fetch_list=[avg_cost] + test_accuracy.metrics)
+                                fetch_list=[avg_cost] + test_accuracy.metrics)
        test_pass_acc = test_accuracy.eval(exe)
        print("End pass {0}, train_acc {1}, test_acc {2}".format(
            pass_id, pass_acc, test_pass_acc))
        model_path = os.path.join(model_save_dir, str(pass_id))
-        fluid.io.save_inference_model(model_path, ['image'], [out], exe)
+        if not os.path.isdir(model_path):
+            os.makedirs(model_path)
+        fluid.io.save_persistables(exe, model_path)
 if __name__ == '__main__':
-    train(learning_rate=0.1, batch_size=7, num_passes=100)
+    train(learning_rate=0.1, batch_size=8, num_passes=100, init_model=None)
--- a/fluid/ocr_recognition/ctc_reader.py
+++ b/fluid/ocr_recognition/ctc_reader.py
+import os
+import cv2
+import numpy as np
+from PIL import Image
+from paddle.v2.image import load_image
+class DataGenerator(object):
+    '''
+    Build sample readers from "h w img_name labels" list files.
+    '''
+    def __init__(self):
+        pass
+    def train_reader(self, img_root_dir, img_label_list, batchsize):
+        '''
+        Reader interface for training.
+        The list file is shuffled by a shell pipeline that writes "tmp.txt"
+        into the current working directory; that file is then read back.
+        :param img_root_dir: The root path of the images for training.
+        :type img_root_dir: str
+        :param img_label_list: The path of the <image_name, label> file for training.
+        :type img_label_list: str
+        :param batchsize: The number of samples in every yielded batch.
+        :type batchsize: int
+        :return: A reader yielding lists of [image, label] pairs. Lines left
+            over after the last full batch are dropped.
+        '''
+        to_file = "tmp.txt"
+        # NOTE(review): img_label_list is interpolated into a shell command
+        # unquoted; only pass trusted paths without shell metacharacters.
+        if batchsize == 1:
+            cmd = "cat " + img_label_list + " | awk '{print $1,$2,$3,$4;}' | shuf > " + to_file
+        else:
+            bs = str(batchsize)
+            #cmd1: partial shuffle
+            cmd = "cat " + img_label_list + " | awk '{printf(\"%04d%.4f %s\\n\", $1, rand(), $0)}' | sort | sed 1,$((1 + RANDOM % 100))d | "
+            #cmd2: batch merge and shuffle
+            cmd += "awk '{printf $2\" \"$3\" \"$4\" \"$5\" \"; if(NR % " + bs + " == 0) print \"\";}' | shuf | "
+            #cmd3: batch split
+            cmd += "awk '{if(NF == " + bs + " * 4) {for(i = 0; i < " + bs + "; i++) print $(4*i+1)\" \"$(4*i+2)\" \"$(4*i+3)\" \"$(4*i+4);}}' > " + to_file
+        print "cmd: " + cmd
+        os.system(cmd)
+        print "finish batch shuffle"
+        # Close the handle deterministically instead of leaking it.
+        with open(to_file, 'r') as shuffled:
+            img_label_lines = shuffled.readlines()
+        def reader():
+            # Floor division keeps the batch count an integer.
+            sizes = len(img_label_lines) // batchsize
+            for i in range(sizes):
+                result = []
+                sz = [0, 0]
+                for j in range(batchsize):
+                    line = img_label_lines[i * batchsize + j]
+                    # h, w, img_name, labels
+                    items = line.split(' ')
+                    label = [int(c) for c in items[-1].split(',')]
+                    # Convert to grayscale.
+                    img = Image.open(os.path.join(img_root_dir, items[
+                        2])).convert('L')
+                    # Every image of a batch is resized to the size of the
+                    # first image of that batch.
+                    if j == 0:
+                        sz = img.size
+                    img = img.resize((sz[0], sz[1]))
+                    img = np.array(img) - 127.5
+                    img = img[np.newaxis, ...]
+                    result.append([img, label])
+                yield result
+        return reader
+    def test_reader(self, img_root_dir, img_label_list):
+        '''
+        Reader interface for inference.
+        :param img_root_dir: The root path of the images for testing.
+        :type img_root_dir: str
+        :param img_label_list: The path of the <image_name, label> file for testing.
+        :type img_label_list: str
+        :return: A reader yielding one (image, label) pair per list line.
+        '''
+        def reader():
+            # The context manager closes the list file once iteration ends.
+            with open(img_label_list) as label_file:
+                for line in label_file:
+                    # h, w, img_name, labels
+                    items = line.split(' ')
+                    label = [int(c) for c in items[-1].split(',')]
+                    img = Image.open(os.path.join(img_root_dir, items[
+                        2])).convert('L')
+                    img = np.array(img) - 127.5
+                    img = img[np.newaxis, ...]
+                    yield img, label
+        return reader
--- a/fluid/text_classification/README.md
+++ b/fluid/text_classification/README.md
+# Text Classification
+## Data Preparation
+```
+wget http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
+tar zxf aclImdb_v1.tar.gz
+```
+## Training
+```
+python train.py --dict_path 'aclImdb/imdb.vocab'
+```
--- a/fluid/text_classification/config.py
+++ b/fluid/text_classification/config.py
+class TrainConfig(object):
+    """Training hyper-parameters, read as class attributes (no instance needed)."""
+    # Whether to use GPU in training or not.
+    use_gpu = False
+    # The training batch size.
+    batch_size = 4
+    # The epoch number.
+    num_passes = 30
+    # The global learning rate.
+    learning_rate = 0.01
+    # Training log will be printed every log_period.
+    log_period = 100
--- a/fluid/text_classification/train.py
+++ b/fluid/text_classification/train.py
+import numpy as np
+import sys
+import os
+import argparse
+import time
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+from config import TrainConfig as conf
+def parse_args():
+    """Parse the command line; --dict_path (str) is mandatory."""
+    arg_parser = argparse.ArgumentParser()
+    dict_path_options = dict(
+        type=str, required=True, help="Path of the word dictionary.")
+    arg_parser.add_argument('--dict_path', **dict_path_options)
+    parsed = arg_parser.parse_args()
+    return parsed
+# Define to_lodtensor function to process the sequential data.
+def to_lodtensor(data, place):
+    """
+    Pack a list of sequences into one LoDTensor.
+    The sequences are concatenated into an int64 column of shape
+    (total_length, 1); the single LoD level stores the cumulative sequence
+    offsets, starting at 0.
+    """
+    offsets = [0]
+    for seq in data:
+        offsets.append(offsets[-1] + len(seq))
+    flat = np.concatenate(data, axis=0).astype("int64")
+    column = flat.reshape([len(flat), 1])
+    tensor = fluid.LoDTensor()
+    tensor.set(column, place)
+    tensor.set_lod([offsets])
+    return tensor
+# Load the dictionary.
+def load_vocab(filename):
+    """Map every stripped line of the file to its zero-based line index."""
+    with open(filename) as vocab_file:
+        return {word.strip(): index for index, word in enumerate(vocab_file)}
+# Define the convolution model.
+def conv_net(dict_dim,
+             window_size=3,
+             emb_dim=128,
+             num_filters=128,
+             fc0_dim=96,
+             class_dim=2):
+    """
+    Build the network: embedding -> sequence conv + max pool -> fc -> softmax.
+    Returns the tuple (words input, label input, prediction, mean cost).
+    """
+    words = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    target = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    embedded = fluid.layers.embedding(input=words, size=[dict_dim, emb_dim])
+    pooled = fluid.nets.sequence_conv_pool(
+        input=embedded,
+        num_filters=num_filters,
+        filter_size=window_size,
+        act="tanh",
+        pool_type="max")
+    hidden = fluid.layers.fc(input=[pooled], size=fc0_dim)
+    probs = fluid.layers.fc(input=[hidden], size=class_dim, act="softmax")
+    loss = fluid.layers.cross_entropy(input=probs, label=target)
+    mean_loss = fluid.layers.mean(x=loss)
+    return words, target, probs, mean_loss
+def main(dict_path):
+    """
+    Train the convolution model on the IMDB readers and print the test
+    accuracy after every pass.
+    Args:
+        dict_path(str): Path of the vocabulary file passed to load_vocab.
+    """
+    word_dict = load_vocab(dict_path)
+    # "<unk>" gets the next free id, right after the loaded vocabulary.
+    word_dict["<unk>"] = len(word_dict)
+    dict_dim = len(word_dict)
+    print("The dictionary size is : %d" % dict_dim)
+    data, label, prediction, avg_cost = conv_net(dict_dim)
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=conf.learning_rate)
+    sgd_optimizer.minimize(avg_cost)
+    accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
+    # The evaluation program is derived from a clone of the main program,
+    # taken after the optimizer and the accuracy evaluator were added.
+    # NOTE(review): presumably get_inference_program keeps only what the
+    # accuracy metrics/states need -- confirm against the fluid.io docs.
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        test_target = accuracy.metrics + accuracy.states
+        inference_program = fluid.io.get_inference_program(test_target)
+    # The training data set.
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=51200),
+        batch_size=conf.batch_size)
+    # The testing data set.
+    test_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.test(word_dict), buf_size=51200),
+        batch_size=conf.batch_size)
+    if conf.use_gpu:
+        place = fluid.CUDAPlace(0)
+    else:
+        place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+    exe.run(fluid.default_startup_program())
+    def test(exe):
+        # Run every test batch through the inference program and return the
+        # value of accuracy.eval() once all batches have been fed.
+        accuracy.reset(exe)
+        for batch_id, data in enumerate(test_reader()):
+            # Each sample is indexed as (word id sequence, label): x[0], x[1].
+            input_seq = to_lodtensor(map(lambda x: x[0], data), place)
+            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+            y_data = y_data.reshape([-1, 1])
+            # NOTE(review): acc is never read; the result comes from
+            # accuracy.eval() below.
+            acc = exe.run(inference_program,
+                          feed={"words": input_seq,
+                                "label": y_data})
+        test_acc = accuracy.eval(exe)
+        return test_acc
+    # Only the training loops are timed; the per-pass test run is excluded.
+    total_time = 0.
+    for pass_id in xrange(conf.num_passes):
+        accuracy.reset(exe)
+        start_time = time.time()
+        for batch_id, data in enumerate(train_reader()):
+            cost_val, acc_val = exe.run(
+                fluid.default_main_program(),
+                feed=feeder.feed(data),
+                fetch_list=[avg_cost, accuracy.metrics[0]])
+            pass_acc = accuracy.eval(exe)
+            # Batch 0 is never logged because of the "batch_id and" guard.
+            if batch_id and batch_id % conf.log_period == 0:
+                print("Pass id: %d, batch id: %d, cost: %f, pass_acc %f" %
+                      (pass_id, batch_id, cost_val, pass_acc))
+        end_time = time.time()
+        total_time += (end_time - start_time)
+        pass_test_acc = test(exe)
+        print("Pass id: %d, test_acc: %f" % (pass_id, pass_test_acc))
+    print("Total train time: %f" % (total_time))
+if __name__ == '__main__':
+    args = parse_args()
+    main(args.dict_path)
--- a/nce_cost/README.md
+++ b/nce_cost/README.md
@@ -106,7 +106,6 @@ NCE 层的一些重要参数解释如下：
 | param\_attr / bias\_attr | 用来设置参数名字 |方便预测阶段加载参数，具体在预测一节中介绍。|
 | num\_neg\_samples | 负样本采样个数|可以控制正负样本比例，这个值取值区间为 [1, 字典大小-1]，负样本个数越多则整个模型的训练速度越慢，模型精度也会越高 |
 | neg\_distribution | 生成负样例标签的分布，默认是一个均匀分布| 可以自行控制负样本采样时各个类别的采样权重。例如：希望正样例为“晴天”时，负样例“洪水”在训练时更被着重区分，则可以将“洪水”这个类别的采样权重增加|
-| act | 使用何种激活函数| 根据 NCE 的原理，这里应该使用 sigmoid 函数 |
 ## 预测
 1. 在命令行运行 :

--- a/ssd/README.cn.md
+++ b/ssd/README.cn.md
@@ -82,7 +82,7 @@ SSD使用一个卷积神经网络实现“端到端”的检测：输入为原
    文件共两个字段，第一个字段为图像文件的相对路径，第二个字段为对应标注文件的相对路径。
 ### 预训练模型准备
-下载预训练的VGG-16模型，我们提供了一个转换好的模型，下载好模型后，放置路径为```vgg/vgg_model.tar.gz```。
+下载预训练的VGG-16模型，我们提供了一个转换好的模型，下载模型[http://paddlepaddle.bj.bcebos.com/model_zoo/detection/ssd_model/vgg_model.tar.gz](http://paddlepaddle.bj.bcebos.com/model_zoo/detection/ssd_model/vgg_model.tar.gz)，并将其放置路径为```vgg/vgg_model.tar.gz```。
 ### 模型训练
 直接执行```python train.py```即可进行训练。需要注意本示例仅支持CUDA GPU环境，无法在CPU上训练，主要因为使用CPU训练速度很慢，实践中一般使用GPU来处理图像任务，这里实现采用硬编码方式使用cuDNN，不提供CPU版本。```train.py```的一些关键执行逻辑：

--- a/text_classification/train.py
+++ b/text_classification/train.py
@@ -46,7 +46,7 @@ def train(topology,
        word_dict = paddle.dataset.imdb.word_dict()
        train_reader = paddle.batch(
            paddle.reader.shuffle(
-                lambda: paddle.dataset.imdb.train(word_dict)(), buf_size=1000),
+                lambda: paddle.dataset.imdb.train(word_dict)(), buf_size=51200),
            batch_size=100)
        test_reader = paddle.batch(
            lambda: paddle.dataset.imdb.test(word_dict)(), batch_size=100)
@@ -83,16 +83,14 @@ def train(topology,
        train_reader = paddle.batch(
            paddle.reader.shuffle(
                reader.train_reader(train_data_dir, word_dict, lbl_dict),
-                buf_size=1000),
+                buf_size=51200),
            batch_size=batch_size)
        if test_data_dir is not None:
            # here, because training and testing data share a same format,
            # we still use the reader.train_reader to read the testing data.
            test_reader = paddle.batch(
-                paddle.reader.shuffle(
+                reader.train_reader(test_data_dir, word_dict, lbl_dict),
-                    reader.train_reader(test_data_dir, word_dict, lbl_dict),
-                    buf_size=1000),
                batch_size=batch_size)
        else:
            test_reader = None

--- a/youtube_recall/README.cn.md
+++ b/youtube_recall/README.cn.md
+# Youtube DNN推荐模型
+以下是本例目录包含的文件以及对应说明:
+```
+├── README.md               # 文档
+├── README.cn.md            # 中文文档
+├── data                    # 示例数据
+│   ├── data.tar            # 示例数据
+├── infer.py                # 预测脚本
+├── network_conf.py         # 模型网络配置
+├── reader.py               # data reader
+├── train.py                # 训练脚本
+├── utils.py                # 工具
+├── data_processor.py       # 数据预处理脚本
+├── user_vector.py          # 获取用户向量脚本
+├── item_vector.py          # 获取视频向量脚本
+└── infer_user.py           # 获取用户个性化脚本
+```
+## 背景介绍\[[1](#参考文献)\]
+Youtube是世界最大的视频网站之一，其推荐系统帮助10亿以上的用户，从海量视频中，发现个性化的内容。该推荐系统主要面临以下三个挑战:
+- 规模: 许多现有的推荐算法证明在小数据量下运行良好，但不能满足YouTube这样庞大的用户群和内容库的场景，因此需要高度专业化的分布式学习算法和高效的线上服务。
+- 新鲜度: YouTube内容库更新频率极高，每秒上传大量视频。系统应及时追踪新上传的视频和用户的实时行为，并且模型在推荐新/旧视频上有良好平衡能力。
+- 噪音: 噪音来自于两方面，其一，用户历史行为稀疏，且有各种不可观测的外部因素，以及用户满意度不明确。其二，内容本身的数据是非结构化的。因此算法应更具有鲁棒性。
+下图展示了整个推荐系统框图:
+<p align="center">
+<img src="images/recommendation_system.png" width="500" height="300" hspace='10'/> <br/>
+Figure 1. 推荐系统框图（出自论文[1]）
+</p>
+整个推荐系统有两部分组成: 召回(candidate generation/recall)和排序(ranking)。
+- 召回模型: 输入用户的历史行为，从大规模的内容库中获得一个小集合(百级别)。召回出的视频与用户高度相关。一个用户是用其历史点击过的视频，搜索过的关键词，和人口统计相关的特征来表征。
+- 排序模型: 采用更精细的特征计算得到排序分，对召回得到的候选集合中的视频进行排序。
+本文主要详细介绍了召回模型的原理与使用。
+## 召回模型简介
+该推荐问题可以被建模成一个"超大规模多分类"问题。即在时刻![](https://www.zhihu.com/equation?tex=t)，为用户![](https://www.zhihu.com/equation?tex=U)(已知上下文信息![](https://www.zhihu.com/equation?tex=C))在视频库![](https://www.zhihu.com/equation?tex=V)中预测出观看视频![](https://www.zhihu.com/equation?tex=i)的类别，
+![](https://www.zhihu.com/equation?tex=%24P(%5Comega_t%3Di%7CU%2CC)%3D%5Cfrac%7Be%5E%7B%5Cmathbf%7Bv_i%7D%5Cmathbf%7Bu%7D%7D%7D%7B%5Csum_%7Bj%5Cin%20V%7D%5E%7B%20%7De%5E%7B%5Cmathbf%7Bv_j%7D%5Cmathbf%7Bu%7D%7D%7D)
+其中![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Cin%20%5Cmathbb%7BR%7D%5EN)，是<用户，上下文信息>的高维向量表示。![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv_j%7D%5Cin%20%5Cmathbb%7BR%7D%5EN)是视频![](https://www.zhihu.com/equation?tex=j)的高维向量表示。DNN模型的目标是以用户信息和上下文信息为输入条件下，学习用户的高维向量表示，以此输入softmax分类器，来预测视频库中各个视频(类别)的观看概率。
+下图展示了召回模型的网络结构:
+<p align="center">
+<img src="images/model_network.png" width="600" height="500" hspace='10'/> <br/>
+Figure 2. 召回模型网络结构（出自论文[1]）
+</p>
+- 输入层:用户的浏览序列、搜索序列、人口统计学特征、和其他上下文信息等
+- embedding层:将用户浏览视频序列接embedding层，再做时间序列上的平均。对于搜索序列同样处理。
+- 隐层:包含三个隐层，用RELU激活函数，最后一层隐层的输出即为高维向量表示![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D)。
+- 输出层: softmax层，输出视频库中各个视频(类别)的观看概率。在线上预测时，提取模型训练得到的softmax层内部的参数，作为视频![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D)的高维向量表示。可利用类似局部敏感哈希(Locality Sensitive Hashing)用![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D)查询最相关的N个视频。
+## 数据预处理
+本例模拟了用户的视频点击日志，作为样本数据。格式如下:
+```
+用户Id \t 所在省份 \t 所在城市 \t 历史点击的视频序列信息 \t 手机型号
+历史点击的视频序列信息的格式为 视频信息1;视频信息2;...;视频信息K
+视频信息的格式为 视频id:视频类目:视频标签1_视频标签2_视频标签3_...视频标签M
+例如:
+USER_ID_15  上海市  上海市    VIDEO_42:CATEGORY_9:TAG115;VIDEO_43:CATEGORY_9:TAG116_TAG115;VIDEO_44:CATEGORY_2:TAG117_TAG71  GO T5
+```
+在youtube_recall目录下运行以下命令（下同），可以解压样本数据。
+```
+cd data
+tar -zxvf data.tar
+```
+然后，脚本`data_processor.py`将对训练数据做预处理。具体使用方法参考如下说明：
+```
+usage: data_processor.py [-h] --train_set_path TRAIN_SET_PATH --output_dir
+                         OUTPUT_DIR [--feat_appear_limit FEAT_APPEAR_LIMIT]
+PaddlePaddle Youtube Recall Model Example
+optional arguments:
+  -h, --help            show this help message and exit
+  --train_set_path TRAIN_SET_PATH
+                        path of the train set
+  --output_dir OUTPUT_DIR
+                        directory to output
+  --feat_appear_limit FEAT_APPEAR_LIMIT
+                        the minimum number of feature values appears (default:
+                        20)
+```
+该脚本的作用如下:
+- 借鉴\[[2](#参考文献)\]中对特征的处理，过滤低频特征(样本中出现次数低于`feat_appear_limit`)。
+- 对特征进行编码，生成字典`feature_dict.pkl`。
+- 统计每个视频出现的概率，保存至`item_freq.pkl`，提供给nce层使用。
+例如可执行下列命令，完成数据预处理:
+```shell
+mkdir output
+python data_processor.py --train_set_path=./data/train.txt \
+                                     --output_dir=./output \
+                                     --feat_appear_limit=20
+```
+## 模型实现
+下面是网络中各个部分的具体实现，相关代码均包含在 `./network_conf.py` 中。
+### 输入层
+```python
+def _build_input_layer(self):
+    """
+    build input layer
+    """
+    self._history_clicked_items = paddle.layer.data(
+        name="history_clicked_items", type=paddle.data_type.integer_value_sequence(
+            len(self._feature_dict['history_clicked_items'])))
+    self._history_clicked_categories = paddle.layer.data(
+        name="history_clicked_categories", type=paddle.data_type.integer_value_sequence(
+            len(self._feature_dict['history_clicked_categories'])))
+    self._history_clicked_tags = paddle.layer.data(
+        name="history_clicked_tags", type=paddle.data_type.integer_value_sequence(
+            len(self._feature_dict['history_clicked_tags'])))
+    self._user_id = paddle.layer.data(
+        name="user_id", type=paddle.data_type.integer_value(
+            len(self._feature_dict['user_id'])))
+    self._province = paddle.layer.data(
+        name="province", type=paddle.data_type.integer_value(
+            len(self._feature_dict['province'])))
+    self._city = paddle.layer.data(
+        name="city", type=paddle.data_type.integer_value(len(self._feature_dict['city'])))
+    self._phone = paddle.layer.data(
+        name="phone", type=paddle.data_type.integer_value(len(self._feature_dict['phone'])))
+    self._target_item = paddle.layer.data(
+        name="target_item", type=paddle.data_type.integer_value(
+            len(self._feature_dict['history_clicked_items'])))
+```
+### Embedding层
+每个输入特征通过embedding到固定维度的向量中。
+```python
+def _create_emb_attr(self, name):
+    """
+    create embedding parameter
+    """
+    return paddle.attr.Param(
+        name=name, initial_std=0.001, learning_rate=1, l2_rate=0, sparse_update=False)
+def _build_embedding_layer(self):
+    """
+    build embedding layer
+    """
+    self._user_id_emb = paddle.layer.embedding(input=self._user_id,
+                                               size=64,
+                                               param_attr=self._create_emb_attr(
+                                                   '_proj_user_id'))
+    self._province_emb = paddle.layer.embedding(input=self._province,
+                                                size=8,
+                                                param_attr=self._create_emb_attr(
+                                                    '_proj_province'))
+    self._city_emb = paddle.layer.embedding(input=self._city,
+                                            size=16,
+                                            param_attr=self._create_emb_attr('_proj_city'))
+    self._phone_emb = paddle.layer.embedding(input=self._phone,
+                                             size=16,
+                                             param_attr=self._create_emb_attr('_proj_phone'))
+    self._history_clicked_items_emb = paddle.layer.embedding(
+        input=self._history_clicked_items,
+        size=64,
+        param_attr=self._create_emb_attr('_proj_history_clicked_items'))
+    self._history_clicked_categories_emb = paddle.layer.embedding(
+        input=self._history_clicked_categories,
+        size=8,
+        param_attr=self._create_emb_attr('_proj_history_clicked_categories'))
+    self._history_clicked_tags_emb = paddle.layer.embedding(
+        input=self._history_clicked_tags,
+        size=64,
+        param_attr=self._create_emb_attr('_proj_history_clicked_tags'))
+```
+### 隐层
+本文对\[[原论文](#参考文献)\](Covington, Paul, Jay Adams, and Emre Sargin. "Deep neural networks for youtube recommendations." Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016.)中的模型做了如下改进：
+- 历史用户点击的视频序列，经过embedding之后，不再使用加权求平均，而是使用lstm序列模型。本文将用户点击的先后次序纳入模型中，然后在时间序列上做最大池化，得到定长向量表示，从而使模型学习到与点击时序相关的隐藏信息。
+- 考虑到数据规模与训练性能，本文只用了两个Relu层，也有很不错的效果。
+```python
+self._rnn_cell = paddle.networks.simple_lstm(
+            input=self._history_clicked_items_emb, size=64)
+        self._lstm_last = paddle.layer.pooling(
+            input=self._rnn_cell, pooling_type=paddle.pooling.Max())
+        self._avg_emb_cats = paddle.layer.pooling(
+            input=self._history_clicked_categories_emb,
+            pooling_type=paddle.pooling.Avg())
+        self._avg_emb_tags = paddle.layer.pooling(
+            input=self._history_clicked_tags_emb,
+            pooling_type=paddle.pooling.Avg())
+        self._fc_0 = paddle.layer.fc(
+            name="Relu1",
+            input=[
+                self._lstm_last, self._user_id_emb, self._province_emb,
+                self._city_emb, self._avg_emb_cats, self._avg_emb_tags,
+                self._phone_emb
+            ],
+            size=self._dnn_layer_dims[0],
+            act=paddle.activation.Relu())
+        self._fc_1 = paddle.layer.fc(
+            name="Relu2",
+            input=self._fc_0,
+            size=self._dnn_layer_dims[1],
+            act=paddle.activation.Relu())
+```
+### 输出层
+为了提高模型训练速度，使用噪声对比估计（Noise-contrastive estimation， NCE）\[[3](#参考文献)\]。将[数据预处理](#数据预处理)中产出的item_freq.pkl，也就是负样例的分布，作为nce层的参数。
+```python
+return paddle.layer.nce(
+                input=self._fc_1,
+                label=self._target_item,
+                num_classes=len(self._feature_dict['history_clicked_items']),
+                param_attr=paddle.attr.Param(name="nce_w"),
+                bias_attr=paddle.attr.Param(name="nce_b"),
+                act=paddle.activation.Sigmoid(),
+                num_neg_samples=5,
+                neg_distribution=self._item_freq)
+```
+## 训练
+首先，准备`reader.py`，负责将输入原始数据中的特征，转为编码后的特征id。对一条训练数据，根据`window_size`产出多条训练样本给trainer，例如:
+```
+window_size=2
+原始数据:
+用户Id \t 所在省份 \t 所在城市 \t 视频信息1;视频信息2;...;视频信息K \t 手机型号
+多条训练样本:
+用户Id，所在省份，所在城市，[<unk>，历史点击视频1]，[<unk>，历史点击视频类目1]，[<unk>，历史点击视频标签1]，手机型号，历史点击视频2
+用户Id，所在省份，所在城市，[历史点击视频1，历史点击视频2]，[历史点击视频类目1，历史点击视频类目2]，[历史点击视频标签1，历史点击视频标签2]，手机型号，历史点击视频3
+用户Id，所在省份，所在城市，[历史点击视频2，历史点击视频3]，[历史点击视频类目2，历史点击视频类目3]，[历史点击视频标签2，历史点击视频标签3]，手机型号，历史点击视频4
+......
+```
+相关代码如下:
+```python
+for i in range(1, len(history_clicked_items_all)):
+    start = max(0, i - self._window_size)
+    history_clicked_items = history_clicked_items_all[start:i]
+    history_clicked_categories = history_clicked_categories_all[start:i]
+    history_clicked_tags_str = history_clicked_tags_all[start:i]
+    history_clicked_tags = []
+    for tags_a in history_clicked_tags_str:
+        for tag in tags_a.split("_"):
+            history_clicked_tags.append(int(tag))
+    target_item = history_clicked_items_all[i]
+    yield user_id, province, city, \
+          history_clicked_items, history_clicked_categories, \
+          history_clicked_tags, phone, target_item
+```
+```python
+reader = Reader(feature_dict, args.window_size)
+    trainer.train(
+        paddle.batch(
+            paddle.reader.shuffle(
+                lambda: reader.train(args.train_set_path),
+                buf_size=7000), args.batch_size),
+        num_passes=args.num_passes,
+        feeding=feeding,
+        event_handler=event_handler)
+```
+接下去就可以开始训练了，可执行以下命令:
+```shell
+mkdir output/model
+python train.py --train_set_path='./data/train.txt' \
+    --test_set_path='./data/test.txt' \
+    --model_output_dir='./output/model/' \
+    --feature_dict='./output/feature_dict.pkl' \
+    --item_freq='./output/item_freq.pkl'
+```
+## 离线预测
+输入用户相关的特征，输出topN个最可能观看的视频，可执行以下命令:
+```shell
+python infer.py --infer_set_path='./data/infer.txt' \
+    --model_path='./output/model/model_pass_00000.tar.gz' \
+    --feature_dict='./output/feature_dict.pkl' \
+    --batch_size=50
+```
+## 在线预测
+在线预测的时候，采用近似最近邻（approximate nearest neighbor-ANN）算法直接用用户向量查询最相关的topN个视频向量，将对应的视频内容推荐给用户。下面介绍如何获得用户向量和视频向量。
+### 用户向量
+用最后一个RELU层的输出，前拼一个常数项1，作为用户向量。这边最后一个RELU层的大小是31维，拼接后的用户向量就是32维，即
+![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%3D%5B1%2Cu_1%2Cu_2%2C...%2Cu_%7B31%7D%5D)
+### 视频向量
+视频向量从模型训练得到的softmax层的参数中提取。假设共有M个不同的视频，那么softmax层输出的是这M个视频各自用户点击的概率，即
+![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bo%7D%3D%5Bs_1%2Cs_2%2C...%2Cs_%7BM%7D%5D)
+从最后一个RELU层输出的用户向量![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D)，到softmax层输出的M个视频的概率![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bo%7D)，中间则是通过乘以了softmax层的参数w,b构成的一个![](https://www.zhihu.com/equation?tex=32%5Ctimes%20M)矩阵，其中的每一列为一个32维的视频向量，按照字典顺序一一对应。
+![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Ccdot%20%5Cbegin%7Bbmatrix%7D%0A%20b_1%20%20%26%20b_2%20%26%20%20%5Ccdots%20%26%20b_M%20%5C%5C%20%0A%20w_%7B11%7D%20%26%20w_%7B21%7D%20%26%20%20%5Ccdots%20%20%26%20w_%7BM1%7D%20%5C%5C%20%0A%20w_%7B12%7D%20%26%20w_%7B22%7D%20%26%20%20%20%5Ccdots%20%26%20w_%7BM2%7D%20%20%5C%5C%20%0A%5Cvdots%20%26%20%5Cvdots%20%26%20%20%5Cvdots%20%26%20%5Cvdots%20%5C%5C%20%0Aw_%7B131%7D%20%26%20%20w_%7B231%7D%20%26%20%20%5Ccdots%20%20%26%20w_%7BM31%7D%20%20%0A%5Cend%7Bbmatrix%7D_%7B32%5Ctimes%20M%7D%20%3D%20%5Cmathbf%7Bu%7D%20%5Ccdot%20%20%5Cbegin%7Bbmatrix%7D%20%0A%5Cmathbf%7Bv_1%7D%2C%20%5Cmathbf%7Bv_2%7D%2C%20%5Ccdots%2C%20%5Cmathbf%7Bv_M%7D%20%0A%5Cend%7Bbmatrix%7D_%7B1%5Ctimes%20M%7D%3D%5Cmathbf%7Bo%7D)
+### SIMPLE-LSH变换
+很多ANN算法只支持cosine距离，而模型是根据内积排序的，两者效果差异较大。为此，这边的解决方案是，对前面得到的用户和视频向量，作SIMPLE-LSH变换\[[4](#参考文献)\]，使内积排序与cosine排序等价。
+具体如下：
+- 对于视频向量![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D%5Cin%20%5Cmathbb%7BR%7D%5EN)，有![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Cmathbf%7Bv%7D%20%5Cright%20%5C%7C%5Cleqslant%20m)，变换后的![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%5Cin%20%5Cmathbb%7BR%7D%5E%7BN%2B1%7D)，![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%20%3D%20%5B%5Cfrac%7B%5Cmathbf%7Bv%7D%7D%7Bm%7D%3B%20%5Csqrt%7B1%20-%5Cleft%20%5C%7C%20%5Cmathbf%7B%5Cfrac%7B%5Cmathbf%7Bv%7D%7D%7Bm%7D%7B%7D%7D%20%5Cright%20%5C%7C%5E2%7D%5D)。
+- 对于用户向量![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Cin%20%5Cmathbb%7BR%7D%5EN)，变换后的![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%5Cin%20%5Cmathbb%7BR%7D%5E%7BN%2B1%7D)，![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%3D%20%5B%5Cmathbf%7Bu%7D_%7Bnorm%7D%3B%200%5D)，其中![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D_%7Bnorm%7D)是模长归一化后的![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D)。
+线上对于一个![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D)用内积召回![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D)，作上述变换![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Crightarrow%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%2C%20%5Cmathbf%7Bv%7D%5Crightarrow%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D)后，不改变内积排序的顺序。又因为![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%5Cright%20%5C%7C) 和![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%20%5Cright%20%5C%7C)都为1，因此![](https://www.zhihu.com/equation?tex=cos(%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%2C%5Ctilde%7B%5Cmathbf%7Bv%7D%7D)%20%3D%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%5Ccdot%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D)，就可以兼容ANN用cosin的方式召回了，结果等价。
+线上使用时，为保留精度，可以不除以![](https://www.zhihu.com/equation?tex=m)，也就变成![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%3D%5B%5Cmathbf%7Bv%7D%3B%5Csqrt%7Bm%5E2-%5Cleft%5C%7C%20%5Cmathbf%7B%5Cmathbf%7Bv%7D%7D%5Cright%5C%7C%5E2%7D%5D)，排序依然等价。
+### 实现
+可使用`user_vector.py`获取用户向量。输入用户特征经过网络预测，probs[1]中存储的是最后一个RELU层的输出，先在其前面拼接一个1，再做SIMPLE-LSH变换（后接一个0，归一化）：
+```python
+probs = inferer.infer(
+        input=test_batch,
+        feeding=feeding,
+        field=["value"],
+        flatten_result=False)
+for i, res in enumerate(zip(probs[1])):
+    # do simple lsh conversion
+    user_vector = [1.000]
+    for i in res[0]:
+        user_vector.append(i)
+    user_vector.append(0.000)
+    norm = np.linalg.norm(user_vector)
+    user_vector_norm = [str(_ / norm) for _ in user_vector]
+    print ",".join(user_vector_norm)
+```
+可使用`item_vector.py`获取视频向量。加载模型，提取参数nce_w和nce_b，拼接M个视频向量，第i个视频向量的第一维是对应的nce_b[0][i]，后面依次是nce_w[i]的全部31维。再做SIMPLE-LSH变换，找到所有向量最大的模，按照![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%3D%5B%5Cmathbf%7Bv%7D%3B%5Csqrt%7Bm%5E2-%5Cleft%5C%7C%20%5Cmathbf%7B%5Cmathbf%7Bv%7D%7D%5Cright%5C%7C%5E2%7D%5D)处理。
+```python
+# load the trained model.
+    with gzip.open(args.model_path) as f:
+        parameters = paddle.parameters.Parameters.from_tar(f)
+    nce_w = parameters.get("nce_w")
+    nce_b = parameters.get("nce_b")
+    item_vector = convt_simple_lsh(get_item_vec_from_softmax(nce_w, nce_b))
+def get_item_vec_from_softmax(nce_w, nce_b):
+    """
+    get item vectors from softmax parameter
+    """
+    if nce_w is None or nce_b is None:
+        return None
+    vector = []
+    total_items_num = nce_w.shape[0]
+    if total_items_num != nce_b.shape[1]:
+        return None
+    dim_vector = nce_w.shape[1] + 1
+    for i in range(0, total_items_num):
+        vector.append([])
+        vector[i].append(nce_b[0][i])
+        for j in range(1, dim_vector):
+            vector[i].append(nce_w[i][j - 1])
+    return vector
+def convt_simple_lsh(vector):
+    """
+    do simple lsh conversion
+    """
+    max_norm = 0
+    num_of_vec = len(vector)
+    for i in range(0, num_of_vec):
+        norm = np.linalg.norm(vector[i])
+        if norm > max_norm:
+            max_norm = norm
+    for i in range(0, num_of_vec):
+        vector[i].append(
+            math.sqrt(
+                math.pow(max_norm, 2) - math.pow(np.linalg.norm(vector[i]), 2)))
+    return vector
+```
+可执行下列命令运行脚本：
+```shell
+python user_vector.py --infer_set_path='./data/infer.txt' \
+        --model_path='./output/model/model_pass_00000.tar.gz' \
+            --feature_dict='./output/feature_dict.pkl' \
+                --batch_size=50
+python item_vector.py --model_path='./output/model/model_pass_00000.tar.gz' \
+            --feature_dict='./output/feature_dict.pkl'
+```
+## 离线挖掘
+因为实时召回需要大量机器资源，这边也可以离线挖掘产出数据，线上召回使用挖掘好的数据。可以产出最热，用户个性化，视频相关等数据。下面的示例产出了用户个性化数据。
+```
+python infer_user.py --model_path='./output/model/model_pass_00000.tar.gz' \
+            --feature_dict='./output/feature_dict.pkl'
+```
+## 参考文献
+1. Covington, Paul, Jay Adams, and Emre Sargin. "Deep neural networks for youtube recommendations." Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016.
+2. https://code.google.com/archive/p/word2vec/
+3. http://paddlepaddle.org/docs/develop/models/nce_cost/README.html
+4. Neyshabur, Behnam, and Nathan Srebro. "On symmetric and asymmetric LSHs for inner product search." arXiv preprint arXiv:1410.5518 (2014).
--- a/youtube_recall/README.md
+++ b/youtube_recall/README.md
+# Deep Neural Networks for YouTube Recommendations
+## Introduction\[[1](#References)\]
+YouTube is the world's largest platform for creating, sharing and discovering video content. YouTube recommendations are responsible for helping more than a billion users discover personalized content from an ever-growing corpus of videos. Recommending YouTube videos is challenging from three major perspectives:
+- Scale: Many existing recommendation algorithms proven to work well on small problems fail to operate on a massive scale. Highly specialized distributed learning algorithms and efficient serving systems are essential.
+- Freshness: YouTube has a very dynamic corpus, with many hours of video uploaded per second. The recommendation system should model newly uploaded content as well as the latest actions taken by users.
+- Noise: Historical user behavior on YouTube is inherently difficult to predict due to sparsity and a variety of unobservable external factors. Furthermore, only noisy implicit feedback signals are observed rather than the ground truth of user satisfaction, and the metadata associated with content is poorly structured, which forces the algorithms to be robust.
+The overall structure of the recommendation system is illustrated in Figure 1.
+<p align="center">
+<img src="images/recommendation_system.png" width="500" height="300" hspace='10'/> <br/>
+Figure 1. Recommendation system architecture[1]
+</p>
+The system is comprised of two neural networks: one for candidate generation and one for ranking.
+- The candidate generation network: It takes events from the user's YouTube activity history as input and retrieves a small subset(hundreds) of videos, highly relevant to the user, from a large corpus. The similarity between users is expressed in terms of coarse features such as IDs of video watches, search query tokens and demographics.
+- The ranking network: It accomplishes this task by assigning a score to each video according to a desired objective function using a rich set of features describing the video and user.
+This markdown describes the principle and use of the candidate generation network in detail.
+## Candidate Generation
+Here, candidate generation is modeled as extreme multiclass classification where the prediction problem becomes accurately classifying a specific video watch ![](https://www.zhihu.com/equation?tex=%5Comega_t) at time ![](https://www.zhihu.com/equation?tex=t) among millions of videos ![](https://www.zhihu.com/equation?tex=i) (classes) from a corpus ![](https://www.zhihu.com/equation?tex=V) based on user ![](https://www.zhihu.com/equation?tex=U) and context ![](https://www.zhihu.com/equation?tex=C),
+![](https://www.zhihu.com/equation?tex=%24P(%5Comega_t%3Di%7CU%2CC)%3D%5Cfrac%7Be%5E%7B%5Cmathbf%7Bv_i%7D%5Cmathbf%7Bu%7D%7D%7D%7B%5Csum_%7Bj%5Cin%20V%7D%5E%7B%20%7De%5E%7B%5Cmathbf%7Bv_j%7D%5Cmathbf%7Bu%7D%7D%7D)
+where ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Cin%20%5Cmathbb%7BR%7D%5EN) represents a high-dimensional "embedding" of the user, context pair and the ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv_j%7D%5Cin%20%5Cmathbb%7BR%7D%5EN) represent embeddings of each candidate video. The task of the deep neural network is to learn user embeddings ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D) as a function of the user's history and context that are useful for discriminating among videos with a softmax classifier.
+Figure 2 shows the general network architecture of candidate generation model:
+<p align="center">
+<img src="images/model_network.png" width="600" height="500" hspace='10'/> <br/>
+Figure 2. Candidate generation model architecture[1]
+</p>
+- Input layer: A user's watch history is represented by a variable-length sequence of sparse video IDs, and search history is similarly represented by a variable-length sequence of search tokens.
+- Embedding layer: Each input feature is mapped to a fixed-size dense vector representation via an embedding, and the embeddings of each sequence are then simply averaged. The embeddings are learned jointly with all other model parameters through normal gradient descent back-propagation updates.
+- Hidden layer: Features are concatenated into a wide first layer, followed by several layers of fully connected Rectified Linear Units (ReLU). The output of the last ReLU layer is the previous mentioned high-dimensional "embedding" of the user ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D), so called user vector.
+- Output layer: A softmax classifier is attached to discriminate among millions of classes (videos). To speed up the training process, negative classes are sampled from a background distribution and corrected with importance weighting. The previously mentioned high-dimensional "embedding" of the candidate video ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D) is obtained from the weight and bias of the softmax layer. At serving time, the N most likely classes (videos) are computed for presentation to the user. To score millions of items under strict serving latency, the scoring problem is reduced to a nearest neighbor search in the dot product space, for which Locality Sensitive Hashing is used.
+## Data Pre-processing
+In this example, we mock users' click logs as sample data. The format is as follows:
+```
+user-id \t province \t city \t history-clicked-video-info-sequence \t phone
+history-clicked-video-info-sequence is formatted as
+video-info1;video-info2;...;video-infoK
+video-info is formatted as
+video-id:category:tag1_tag2_tag3_...tagM
+For example:
+USER_ID_15  Shanghai  Shanghai    VIDEO_42:CATEGORY_9:TAG115;VIDEO_43:CATEGORY_9:TAG116_TAG115;VIDEO_44:CATEGORY_2:TAG117_TAG71  GO T5
+```
+Run this code in `youtube_recall` directory (the same below) to prepare the sample data.
+```
+cd data
+tar -zxvf data.tar
+```
+Then, run `data_processor.py` for data pre-processing. Refer to the following instructions:
+```
+usage: data_processor.py [-h] --train_set_path TRAIN_SET_PATH --output_dir
+                         OUTPUT_DIR [--feat_appear_limit FEAT_APPEAR_LIMIT]
+PaddlePaddle Deep Candidate Generation Example
+optional arguments:
+  -h, --help            show this help message and exit
+  --train_set_path TRAIN_SET_PATH
+                        path of the train set
+  --output_dir OUTPUT_DIR
+                        directory to output
+  --feat_appear_limit FEAT_APPEAR_LIMIT
+                        the minimum number of feature values appears (default:
+                        20)
+```
+The function of this script is as follows:
+- Filter low-frequency features\[[2](#References)\], i.e. those which appear fewer than `feat_appear_limit` times.
+- Encode features, and generate the dictionary `feature_dict.pkl`.
+- Compute the occurrence probability of each video, write it to `item_freq.pkl`, and provide it to the NCE layer.
+For example, run the following command to accomplish data pre-processing:
+```
+mkdir output
+python data_processor.py --train_set_path=./data/train.txt \
+                                     --output_dir=./output \
+                                     --feat_appear_limit=20
+```
+## Model Implementation
+The details of the model implementation are illustrated as follows. The code is in `./network_conf.py`.
+### Input layer
+```python
+def _build_input_layer(self):
+    """
+    build input layer
+    """
+    self._history_clicked_items = paddle.layer.data(
+        name="history_clicked_items", type=paddle.data_type.integer_value_sequence(
+            len(self._feature_dict['history_clicked_items'])))
+    self._history_clicked_categories = paddle.layer.data(
+        name="history_clicked_categories", type=paddle.data_type.integer_value_sequence(
+            len(self._feature_dict['history_clicked_categories'])))
+    self._history_clicked_tags = paddle.layer.data(
+        name="history_clicked_tags", type=paddle.data_type.integer_value_sequence(
+            len(self._feature_dict['history_clicked_tags'])))
+    self._user_id = paddle.layer.data(
+        name="user_id", type=paddle.data_type.integer_value(
+            len(self._feature_dict['user_id'])))
+    self._province = paddle.layer.data(
+        name="province", type=paddle.data_type.integer_value(
+            len(self._feature_dict['province'])))
+    self._city = paddle.layer.data(
+        name="city", type=paddle.data_type.integer_value(len(self._feature_dict['city'])))
+    self._phone = paddle.layer.data(
+        name="phone", type=paddle.data_type.integer_value(len(self._feature_dict['phone'])))
+    self._target_item = paddle.layer.data(
+        name="target_item", type=paddle.data_type.integer_value(
+            len(self._feature_dict['history_clicked_items'])))
+```
+### Embedding layer
+Each input feature is mapped to a fixed-size dense vector representation.
+```python
+def _create_emb_attr(self, name):
+    """
+    create embedding parameter
+    """
+    return paddle.attr.Param(
+        name=name, initial_std=0.001, learning_rate=1, l2_rate=0, sparse_update=False)
+def _build_embedding_layer(self):
+    """
+    build embedding layer
+    """
+    self._user_id_emb = paddle.layer.embedding(input=self._user_id,
+                                               size=64,
+                                               param_attr=self._create_emb_attr(
+                                                   '_proj_user_id'))
+    self._province_emb = paddle.layer.embedding(input=self._province,
+                                                size=8,
+                                                param_attr=self._create_emb_attr(
+                                                    '_proj_province'))
+    self._city_emb = paddle.layer.embedding(input=self._city,
+                                            size=16,
+                                            param_attr=self._create_emb_attr('_proj_city'))
+    self._phone_emb = paddle.layer.embedding(input=self._phone,
+                                             size=16,
+                                             param_attr=self._create_emb_attr('_proj_phone'))
+    self._history_clicked_items_emb = paddle.layer.embedding(
+        input=self._history_clicked_items,
+        size=64,
+        param_attr=self._create_emb_attr('_proj_history_clicked_items'))
+    self._history_clicked_categories_emb = paddle.layer.embedding(
+        input=self._history_clicked_categories,
+        size=8,
+        param_attr=self._create_emb_attr('_proj_history_clicked_categories'))
+    self._history_clicked_tags_emb = paddle.layer.embedding(
+        input=self._history_clicked_tags,
+        size=64,
+        param_attr=self._create_emb_attr('_proj_history_clicked_tags'))
+```
+### Hidden layer
+This example improves on the original network in \[[Original Paper](#References)\](Covington, Paul, Jay Adams, and Emre Sargin. "Deep neural networks for youtube recommendations." Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016.) as follows:
+- The embeddings of video watches are no longer simply averaged; instead, they are fed into an LSTM layer followed by max pooling over time, so that the sequential information related to user interests can be learned.
+- Considering data scale and training efficiency, only two ReLU layers are used, which still leads to good performance.
+```python
+self._rnn_cell = paddle.networks.simple_lstm(input=self._history_clicked_items_emb, size=64)
+self._lstm_last = paddle.layer.pooling(
+    input=self._rnn_cell, pooling_type=paddle.pooling.Max())
+self._avg_emb_cats = paddle.layer.pooling(input=self._history_clicked_categories_emb,
+                                          pooling_type=paddle.pooling.Avg())
+self._avg_emb_tags = paddle.layer.pooling(input=self._history_clicked_tags_emb,
+                                          pooling_type=paddle.pooling.Avg())
+self._fc_0 = paddle.layer.fc(
+    name="Relu1",
+    input=[self._lstm_last, self._user_id_emb,
+           self._city_emb, self._phone_emb],
+    size=self._dnn_layer_dims[0],
+    act=paddle.activation.Relu())
+self._fc_1 = paddle.layer.fc(
+    name="Relu2",
+    input=self._fc_0,
+    size=self._dnn_layer_dims[1],
+    act=paddle.activation.Relu())
+```
+### Output layer
+To speed up the training process, noise-contrastive estimation (NCE)\[[3](#references)\] is applied to sample negative classes from a background distribution with importance weighting. The previously mentioned `item_freq.pkl` (see [data pre-processing](#data-pre-processing)) is used as `neg_distribution`.
+```python
+return paddle.layer.nce(
+                input=self._fc_1,
+                label=self._target_item,
+                num_classes=len(self._feature_dict['history_clicked_items']),
+                param_attr=paddle.attr.Param(name="nce_w"),
+                bias_attr=paddle.attr.Param(name="nce_b"),
+                num_neg_samples=5,
+                neg_distribution=self._item_freq)
+```
+## Train
+First of all, prepare `reader.py`, the function of which is to convert raw features into encoding id. One piece of train data generates several data instances according to `window_size`, and then is fed into trainer.
+```
+window_size=2
+train data:
+user-id \t province \t city \t video-info1;video-info2;...;video-infoK \t phone
+several data instances:
+user-id,province,city,[<unk>,video-id1],[<unk>,category1],[<unk>,tags1],phone,video-id2
+user-id,province,city,[video-id1,video-id2],[category1,category2],[tags1,tags2],phone,video-id3
+user-id,province,city,[video-id2,video-id3],[category2,category3],[tags2,tags3],phone,video-id4
+......
+```
+The relevant code is as follows:
+```python
+for i in range(1, len(history_clicked_items_all)):
+    start = max(0, i - self._window_size)
+    history_clicked_items = history_clicked_items_all[start:i]
+    history_clicked_categories = history_clicked_categories_all[start:i]
+    history_clicked_tags_str = history_clicked_tags_all[start:i]
+    history_clicked_tags = []
+    for tags_a in history_clicked_tags_str:
+        for tag in tags_a.split("_"):
+            history_clicked_tags.append(int(tag))
+    target_item = history_clicked_items_all[i]
+    yield user_id, province, city, \
+          history_clicked_items, history_clicked_categories, \
+          history_clicked_tags, phone, target_item
+```
+```python
+reader = Reader(feature_dict, args.window_size)
+    trainer.train(
+        paddle.batch(
+            paddle.reader.shuffle(
+                lambda: reader.train(args.train_set_path),
+                buf_size=7000), args.batch_size),
+        num_passes=args.num_passes,
+        feeding=feeding,
+        event_handler=event_handler)
+```
+Then start training.
+```shell
+mkdir output/model
+python train.py --train_set_path='./data/train.txt' \
+    --test_set_path='./data/test.txt' \
+    --model_output_dir='./output/model/' \
+    --feature_dict='./output/feature_dict.pkl' \
+    --item_freq='./output/item_freq.pkl'
+```
+## Offline prediction
+Input user related features, and then get the most likely N videos for user.
+```shell
+python infer.py --infer_set_path='./data/infer.txt' \
+    --model_path='./output/model/model_pass_00000.tar.gz' \
+    --feature_dict='./output/feature_dict.pkl' \
+    --batch_size=50
+```
+## Online prediction
+For online prediction, Approximate Nearest Neighbor (ANN) search is adopted to directly recall the top N videos most likely to be watched. The following shows how to obtain the user vector and the video vectors.
+### User Vector
+User vector is the output of the last RELU layer with cascading a constant term 1 in the front. Here the dimension of the last RELU layer is 31, and thus the dimension of user vector is 32.
+![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%3D%5B1%2Cu_1%2Cu_2%2C...%2Cu_%7B31%7D%5D)
+### Video Vector
+Video vector is extracted from the parameters of softmax layer. If there are M different videos, the output of softmax layer will be the probability of click of these M videos.
+![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bo%7D%3D%5Bs_1%2Cs_2%2C...%2Cs_%7BM%7D%5D)
+To get ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bo%7D) from user vector ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D), a ![](https://www.zhihu.com/equation?tex=32%5Ctimes%20M) matrix which consists of the parameters w, b of softmax layer is multiplied. Each column of this matrix is a 32-dim video vector, according to the dictionary order one by one.
+![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Ccdot%20%5Cbegin%7Bbmatrix%7D%0A%20b_1%20%20%26%20b_2%20%26%20%20%5Ccdots%20%26%20b_M%20%5C%5C%20%0A%20w_%7B11%7D%20%26%20w_%7B21%7D%20%26%20%20%5Ccdots%20%20%26%20w_%7BM1%7D%20%5C%5C%20%0A%20w_%7B12%7D%20%26%20w_%7B22%7D%20%26%20%20%20%5Ccdots%20%26%20w_%7BM2%7D%20%20%5C%5C%20%0A%5Cvdots%20%26%20%5Cvdots%20%26%20%20%5Cvdots%20%26%20%5Cvdots%20%5C%5C%20%0Aw_%7B131%7D%20%26%20%20w_%7B231%7D%20%26%20%20%5Ccdots%20%20%26%20w_%7BM31%7D%20%20%0A%5Cend%7Bbmatrix%7D_%7B32%5Ctimes%20M%7D%20%3D%20%5Cmathbf%7Bu%7D%20%5Ccdot%20%20%5Cbegin%7Bbmatrix%7D%20%0A%5Cmathbf%7Bv_1%7D%2C%20%5Cmathbf%7Bv_2%7D%2C%20%5Ccdots%2C%20%5Cmathbf%7Bv_M%7D%20%0A%5Cend%7Bbmatrix%7D_%7B1%5Ctimes%20M%7D%3D%5Cmathbf%7Bo%7D)
+### SIMPLE-LSH conversion
+However, most ANN systems currently support only cosine ranking, not inner-product ranking, and the two can produce very different results.
+To solve this, the user and video vectors are slightly modified by a SIMPLE-LSH conversion\[[4](#References)\], so that inner-product ranking is equivalent to cosine ranking after the conversion.
+Details are as follows:
+- For video vector ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D%5Cin%20%5Cmathbb%7BR%7D%5EN), ![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Cmathbf%7Bv%7D%20%5Cright%20%5C%7C%5Cleqslant%20m). The modified video vector ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%5Cin%20%5Cmathbb%7BR%7D%5E%7BN%2B1%7D), and let ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%20%3D%20%5B%5Cfrac%7B%5Cmathbf%7Bv%7D%7D%7Bm%7D%3B%20%5Csqrt%7B1%20-%5Cleft%20%5C%7C%20%5Cmathbf%7B%5Cfrac%7B%5Cmathbf%7Bv%7D%7D%7Bm%7D%7B%7D%7D%20%5Cright%20%5C%7C%5E2%7D%5D).
+- For user vector ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Cin%20%5Cmathbb%7BR%7D%5EN), and the modified user vector ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%5Cin%20%5Cmathbb%7BR%7D%5E%7BN%2B1%7D), and let ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%3D%20%5B%5Cmathbf%7Bu%7D_%7Bnorm%7D%3B%200%5D), where ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D_%7Bnorm%7D) is normalized ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D).
+During online prediction, an incoming ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D) should recall ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D) by inner-product ranking. After the ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Crightarrow%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%2C%20%5Cmathbf%7Bv%7D%5Crightarrow%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D) conversion, the order of the inner-product ranking is unchanged. Since ![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%5Cright%20%5C%7C) and ![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%20%5Cright%20%5C%7C) are both equal to 1, ![](https://www.zhihu.com/equation?tex=cos(%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%2C%5Ctilde%7B%5Cmathbf%7Bv%7D%7D)%20%3D%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%5Ccdot%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D), which lets an ANN system that supports only cosine ranking work.
+To retain precision, using ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%3D%5B%5Cmathbf%7Bv%7D%3B%5Csqrt%7Bm%5E2-%5Cleft%5C%7C%20%5Cmathbf%7B%5Cmathbf%7Bv%7D%7D%5Cright%5C%7C%5E2%7D%5D) instead is also equivalent.
+### Implementation
+Run `user_vector.py` to generate the user vectors. First feed the features into the network and run inference. The output of the last RELU layer is saved in the variable probs[1]. The user vector is generated by prepending a constant term 1 and applying the SIMPLE-LSH conversion.
+```python
+probs = inferer.infer(
+        input=test_batch,
+        feeding=feeding,
+        field=["value"],
+        flatten_result=False)
+for i, res in enumerate(zip(probs[1])):
+    # do simple lsh conversion
+    user_vector = [1.000]
+    for i in res[0]:
+        user_vector.append(i)
+    user_vector.append(0.000)
+    norm = np.linalg.norm(user_vector)
+    user_vector_norm = [str(_ / norm) for _ in user_vector]
+    print ",".join(user_vector_norm)
+```
+Run `item_vector.py` to generate the video vectors. First load the model and extract the parameters nce_w and nce_b. Then build the i-th video vector by putting nce_b[0][i] in the first dimension and the weights nce_w[i] in the remaining dimensions. Finally apply the SIMPLE-LSH conversion: find the maximum norm and extend each vector according to ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%3D%5B%5Cmathbf%7Bv%7D%3B%5Csqrt%7Bm%5E2-%5Cleft%5C%7C%20%5Cmathbf%7B%5Cmathbf%7Bv%7D%7D%5Cright%5C%7C%5E2%7D%5D).
+```python
+# load the trained model.
+    with gzip.open(args.model_path) as f:
+        parameters = paddle.parameters.Parameters.from_tar(f)
+    nce_w = parameters.get("nce_w")
+    nce_b = parameters.get("nce_b")
+    item_vector = convt_simple_lsh(get_item_vec_from_softmax(nce_w, nce_b))
+def get_item_vec_from_softmax(nce_w, nce_b):
+    """
+    get item vectors from softmax parameter
+    """
+    if nce_w is None or nce_b is None:
+        return None
+    vector = []
+    total_items_num = nce_w.shape[0]
+    if total_items_num != nce_b.shape[1]:
+        return None
+    dim_vector = nce_w.shape[1] + 1
+    for i in range(0, total_items_num):
+        vector.append([])
+        vector[i].append(nce_b[0][i])
+        for j in range(1, dim_vector):
+            vector[i].append(nce_w[i][j - 1])
+    return vector
+def convt_simple_lsh(vector):
+    """
+    do simple lsh conversion
+    """
+    max_norm = 0
+    num_of_vec = len(vector)
+    for i in range(0, num_of_vec):
+        norm = np.linalg.norm(vector[i])
+        if norm > max_norm:
+            max_norm = norm
+    for i in range(0, num_of_vec):
+        vector[i].append(
+            math.sqrt(
+                math.pow(max_norm, 2) - math.pow(np.linalg.norm(vector[i]), 2)))
+    return vector
+```
+Use `user_vector.py` and `item_vector.py` to calculate user and item vectors. For example, run the following commands:
+```shell
+python user_vector.py --infer_set_path='./data/infer.txt' \
+        --model_path='./output/model/model_pass_00000.tar.gz' \
+            --feature_dict='./output/feature_dict.pkl' \
+                --batch_size=50
+python item_vector.py --model_path='./output/model/model_pass_00000.tar.gz' \
+            --feature_dict='./output/feature_dict.pkl'
+```
+## Offline data mining
+Since online prediction inevitably consumes a large amount of machine resources, an alternative is offline data mining (e.g. hottest videos, personalized user recommendation, item-based recommendation), whose results the online systems then access directly. The following example shows how to get personalized recommendations for users.
+```
+python infer_user.py --model_path='./output/model/model_pass_00000.tar.gz' \
+            --feature_dict='./output/feature_dict.pkl'
+```
+## References
+1. Covington, Paul, Jay Adams, and Emre Sargin. "Deep neural networks for youtube recommendations." Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016.
+2. https://code.google.com/archive/p/word2vec/
+3. http://paddlepaddle.org/docs/develop/models/nce_cost/README.html
+4. Neyshabur, Behnam, and Nathan Srebro. "On symmetric and asymmetric LSHs for inner product search." arXiv preprint arXiv:1410.5518 (2014).
--- a/youtube_recall/data/data.tar
+++ b/youtube_recall/data/data.tar
--- a/youtube_recall/data_processor.py
+++ b/youtube_recall/data_processor.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import sys
+import argparse
+import os
+import cPickle
+from utils import logger
+"""
+This script will output 2 files:
+1. feature_dict.pkl
+2. item_freq.pkl
+"""
+class FeatureGenerator(object):
+    """
+    Encode feature values with low-frequency filtering.
+    """
+    def __init__(self, feat_appear_limit=20):
+        """
+        @feat_appear_limit: int
+        """
+        self._dic = None  # feature value --> id
+        self._count = None  # numbers of appearances of feature values
+        self._feat_appear_limit = feat_appear_limit
+    def add_feat_val(self, feat_val):
+        """
+        Add feature values and count numbers of its appearance. 
+        """
+        if self._count is None:
+            self._count = {'<unk>': 0}
+        if feat_val == "NULL":
+            feat_val = '<unk>'
+        if feat_val not in self._count:
+            self._count[feat_val] = 1
+        else:
+            self._count[feat_val] += 1
+            self._count['<unk>'] += 1
+    def _filter_feat(self):
+        """
+        Filter low-frequency feature values.
+        """
+        self._items = filter(lambda x: x[1] > self._feat_appear_limit,
+                             self._count.items())
+        self._items.sort(key=lambda x: x[1], reverse=True)
+    def _build_dict(self):
+        """
+        Build feature values --> ids dict.
+        """
+        self._dic = {}
+        self._filter_feat()
+        for i in xrange(len(self._items)):
+            self._dic[self._items[i][0]] = i
+        self.dim = len(self._dic)
+    def get_feat_id(self, feat_val):
+        """
+        Get id of feature value after encoding.
+        """
+        # build dict
+        if self._dic is None:
+            self._build_dict()
+        # find id
+        if feat_val in self._dic:
+            return self._dic[feat_val]
+        else:
+            return self._dic['<unk>']
+    def get_dim(self):
+        """
+        Get dim.
+        """
+        # build dict
+        if self._dic is None:
+            self._build_dict()
+        return len(self._dic)
+    def get_dict(self):
+        """
+        Get dict.
+        """
+        # build dict
+        if self._dic is None:
+            self._build_dict()
+        return self._dic
+    def get_total_count(self):
+        """
+        Compute total num of count.
+        """
+        total_count = 0
+        for i in xrange(len(self._items)):
+            feat_val = self._items[i][0]
+            c = self._items[i][1]
+            total_count += c
+        return total_count
+    def count_iterator(self):
+        """
+        Iterate feature values and its num of appearance.
+        """
+        for i in xrange(len(self._items)):
+            yield self._items[i][0], self._items[i][1]
+    def __repr__(self):
+        """
+        """
+        return '<FeatureGenerator %d>' % self._dim
+def scan_build_dict(data_path, features_dict):
+    """
+    Scan the raw data and add all feature values.
+    """
+    logger.info('scan data set')
+    with open(data_path, 'r') as f:
+        for (line_id, line) in enumerate(f):
+            fields = line.strip('\n').split('\t')
+            user_id = fields[0]
+            province = fields[1]
+            features_dict['province'].add_feat_val(province)
+            city = fields[2]
+            features_dict['city'].add_feat_val(city)
+            item_infos = fields[3]
+            phone = fields[4]
+            features_dict['phone'].add_feat_val(phone)
+            for item_info in item_infos.split(";"):
+                item_info_array = item_info.split(":")
+                item = item_info_array[0]
+                features_dict['history_clicked_items'].add_feat_val(item)
+                features_dict['user_id'].add_feat_val(user_id)
+                category = item_info_array[1]
+                features_dict['history_clicked_categories'].add_feat_val(
+                    category)
+                tags = item_info_array[2]
+                for tag in tags.split("_"):
+                    features_dict['history_clicked_tags'].add_feat_val(tag)
+def parse_args():
+    """
+    parse arguments
+    """
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle Youtube Recall Model Example")
+    parser.add_argument(
+        '--train_set_path',
+        type=str,
+        required=True,
+        help="path of the train set")
+    parser.add_argument(
+        '--output_dir', type=str, required=True, help="directory to output")
+    parser.add_argument(
+        '--feat_appear_limit',
+        type=int,
+        default=20,
+        help="the minimum number of feature values appears (default: 20)")
+    return parser.parse_args()
+if __name__ == '__main__':
+    args = parse_args()
+    # check argument
+    assert os.path.exists(
+        args.train_set_path), 'The train set path does not exist.'
+    # features used
+    features = [
+        'user_id', 'province', 'city', 'phone', 'history_clicked_items',
+        'history_clicked_tags', 'history_clicked_categories'
+    ]
+    # init feature generators
+    features_dict = {}
+    for feature in features:
+        features_dict[feature] = FeatureGenerator(
+            feat_appear_limit=args.feat_appear_limit)
+    # scan data for building dict
+    scan_build_dict(args.train_set_path, features_dict)
+    # generate feature_dict.pkl
+    feature_encoding_dict = {}
+    for feature in features:
+        d = features_dict[feature].get_dict()
+        feature_encoding_dict[feature] = d
+        logger.info('Feature:%s, dimension is %d' % (feature, len(d)))
+    output_dict_path = os.path.join(args.output_dir, 'feature_dict.pkl')
+    with open(output_dict_path, "w") as f:
+        cPickle.dump(feature_encoding_dict, f, -1)
+    # generate item_freq.pkl
+    item_freq_list = []
+    g = features_dict['history_clicked_items']
+    total_count = g.get_total_count()
+    for feat_val, feat_count in g.count_iterator():
+        item_freq_list.append(float(feat_count) / total_count)
+    logger.info('item_freq, dimension is %d' % (len(item_freq_list)))
+    output_item_freq_path = os.path.join(args.output_dir, 'item_freq.pkl')
+    with open(output_item_freq_path, "w") as f:
+        cPickle.dump(item_freq_list, f, -1)
+    logger.info('Complete!')
--- a/youtube_recall/images/model_network.png
+++ b/youtube_recall/images/model_network.png
--- a/youtube_recall/images/recommendation_system.png
+++ b/youtube_recall/images/recommendation_system.png
--- a/youtube_recall/infer.py
+++ b/youtube_recall/infer.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import os
+import gzip
+import paddle.v2 as paddle
+import argparse
+import cPickle
+from reader import Reader
+from network_conf import DNNmodel
+from utils import logger
+def parse_args():
+    """
+    parse arguments
+    :return:
+    """
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle Youtube Recall Model Example")
+    parser.add_argument(
+        '--infer_set_path',
+        type=str,
+        required=True,
+        help="path of the infer set")
+    parser.add_argument(
+        '--model_path', type=str, required=True, help="path of the model")
+    parser.add_argument(
+        '--feature_dict',
+        type=str,
+        required=True,
+        help="path of feature_dict.pkl")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=50,
+        help="size of mini-batch (default:50)")
+    return parser.parse_args()
+def infer():
+    """
+    infer
+    """
+    args = parse_args()
+    # check argument
+    assert os.path.exists(
+        args.infer_set_path), 'The infer_set_path path does not exist.'
+    assert os.path.exists(
+        args.model_path), 'The model_path path does not exist.'
+    assert os.path.exists(
+        args.feature_dict), 'The feature_dict path does not exist.'
+    paddle.init(use_gpu=False, trainer_count=1)
+    with open(args.feature_dict) as f:
+        feature_dict = cPickle.load(f)
+    nid_dict = feature_dict['history_clicked_items']
+    nid_to_word = dict((v, k) for k, v in nid_dict.items())
+    # load the trained model.
+    with gzip.open(args.model_path) as f:
+        parameters = paddle.parameters.Parameters.from_tar(f)
+    # build model
+    prediction_layer, fc = DNNmodel(
+        dnn_layer_dims=[256, 31], feature_dict=feature_dict,
+        is_infer=True).model_cost
+    inferer = paddle.inference.Inference(
+        output_layer=[prediction_layer, fc], parameters=parameters)
+    reader = Reader(feature_dict)
+    test_batch = []
+    for idx, item in enumerate(reader.infer(args.infer_set_path)):
+        test_batch.append(item)
+        if len(test_batch) == args.batch_size:
+            infer_a_batch(inferer, test_batch, nid_to_word)
+            test_batch = []
+    if len(test_batch):
+        infer_a_batch(inferer, test_batch, nid_to_word)
+def infer_a_batch(inferer, test_batch, nid_to_word):
+    """
+    input a batch of data and infer 
+    """
+    feeding = {
+        'user_id': 0,
+        'province': 1,
+        'city': 2,
+        'history_clicked_items': 3,
+        'history_clicked_categories': 4,
+        'history_clicked_tags': 5,
+        'phone': 6
+    }
+    probs = inferer.infer(
+        input=test_batch,
+        feeding=feeding,
+        field=["value"],
+        flatten_result=False)
+    for i, res in enumerate(zip(test_batch, probs[0], probs[1])):
+        softmax_output = res[1]
+        sort_nid = res[1].argsort()
+        # print top 30 recommended item 
+        ret = ""
+        for j in range(1, 30):
+            item_id = sort_nid[-1 * j]
+            item_id_to_word = nid_to_word[item_id]
+            ret += "%s:%.6f," \
+                    % (item_id_to_word, softmax_output[item_id])
+        print ret.rstrip(",")
+# Script entry point: run the inference only when executed directly.
+if __name__ == "__main__":
+    infer()
--- a/youtube_recall/infer_user.py
+++ b/youtube_recall/infer_user.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import os
+import gzip
+import paddle.v2 as paddle
+import argparse
+import cPickle
+from reader import Reader
+from network_conf import DNNmodel
+from utils import logger
+import numpy as np
+def parse_args():
+    """
+    parse arguments
+    :return:
+    """
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle Youtube Recall Model Example")
+    parser.add_argument(
+        '--model_path', type=str, required=True, help="path of the model")
+    parser.add_argument(
+        '--feature_dict',
+        type=str,
+        required=True,
+        help="path of feature_dict.pkl")
+    return parser.parse_args()
+def infer_user():
+    """
+    infer_user
+    """
+    args = parse_args()
+    # check argument
+    assert os.path.exists(
+        args.model_path), 'The model_path path does not exist.'
+    assert os.path.exists(
+        args.feature_dict), 'The feature_dict path does not exist.'
+    paddle.init(use_gpu=False, trainer_count=1)
+    with open(args.feature_dict) as f:
+        feature_dict = cPickle.load(f)
+    nid_dict = feature_dict['history_clicked_items']
+    nid_to_word = dict((v, k) for k, v in nid_dict.items())
+    # load the trained model.
+    with gzip.open(args.model_path) as f:
+        parameters = paddle.parameters.Parameters.from_tar(f)
+    parameters.set('_proj_province', \
+            np.zeros(shape=parameters.get('_proj_province').shape))
+    parameters.set('_proj_city', \
+            np.zeros(shape=parameters.get('_proj_city').shape))
+    parameters.set('_proj_phone', \
+            np.zeros(shape=parameters.get('_proj_phone').shape))
+    parameters.set('_proj_history_clicked_items', \
+            np.zeros(shape= parameters.get('_proj_history_clicked_items').shape))
+    parameters.set('_proj_history_clicked_categories', \
+            np.zeros(shape= parameters.get('_proj_history_clicked_categories').shape))
+    parameters.set('_proj_history_clicked_tags', \
+            np.zeros(shape= parameters.get('_proj_history_clicked_tags').shape))
+    # build model
+    prediction_layer, fc = DNNmodel(
+        dnn_layer_dims=[256, 31], feature_dict=feature_dict,
+        is_infer=True).model_cost
+    inferer = paddle.inference.Inference(
+        output_layer=[prediction_layer, fc], parameters=parameters)
+    reader = Reader(feature_dict)
+    test_batch = []
+    for idx, item in enumerate(
+            reader.infer_user(['USER_ID_0', 'USER_ID_981', 'USER_ID_310806'])):
+        test_batch.append(item)
+    infer_a_batch(inferer, test_batch, nid_to_word)
+def infer_a_batch(inferer, test_batch, nid_to_word):
+    """
+    input a batch of data and infer 
+    """
+    feeding = {
+        'user_id': 0,
+        'province': 1,
+        'city': 2,
+        'history_clicked_items': 3,
+        'history_clicked_categories': 4,
+        'history_clicked_tags': 5,
+        'phone': 6
+    }
+    probs = inferer.infer(
+        input=test_batch,
+        feeding=feeding,
+        field=["value"],
+        flatten_result=False)
+    for i, res in enumerate(zip(test_batch, probs[0], probs[1])):
+        softmax_output = res[1]
+        sort_nid = res[1].argsort()
+        # print top 30 recommended item 
+        ret = ""
+        for j in range(1, 30):
+            item_id = sort_nid[-1 * j]
+            item_id_to_word = nid_to_word[item_id]
+            ret += "%s:%.6f," \
+                    % (item_id_to_word, softmax_output[item_id])
+        print ret.rstrip(",")
+# Script entry point: run the user inference only when executed directly.
+if __name__ == "__main__":
+    infer_user()
--- a/youtube_recall/item_vector.py
+++ b/youtube_recall/item_vector.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import os
+import gzip
+import paddle.v2 as paddle
+import argparse
+import cPickle
+from reader import Reader
+from network_conf import DNNmodel
+from utils import logger
+import numpy as np
+import math
+def parse_args():
+    """
+    parse arguments
+    :return:
+    """
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle Youtube Recall Model Example")
+    parser.add_argument(
+        '--model_path', type=str, required=True, help="path of the model")
+    parser.add_argument(
+        '--feature_dict',
+        type=str,
+        required=True,
+        help="path of feature_dict.pkl")
+    return parser.parse_args()
+def get_item_vec_from_softmax(nce_w, nce_b):
+    """
+    get item vectors from softmax parameter 
+    """
+    if nce_w is None or nce_b is None:
+        return None
+    vector = []
+    total_items_num = nce_w.shape[0]
+    if total_items_num != nce_b.shape[1]:
+        return None
+    dim_vector = nce_w.shape[1] + 1
+    for i in range(0, total_items_num):
+        vector.append([])
+        vector[i].append(nce_b[0][i])
+        for j in range(1, dim_vector):
+            vector[i].append(nce_w[i][j - 1])
+    return vector
+def convt_simple_lsh(vector):
+    """
+    do simple lsh conversion
+    """
+    max_norm = 0
+    num_of_vec = len(vector)
+    for i in range(0, num_of_vec):
+        norm = np.linalg.norm(vector[i])
+        if norm > max_norm:
+            max_norm = norm
+    for i in range(0, num_of_vec):
+        vector[i].append(
+            math.sqrt(
+                math.pow(max_norm, 2) - math.pow(np.linalg.norm(vector[i]), 2)))
+    return vector
+def item_vector():
+    """
+    get item vectors
+    """
+    args = parse_args()
+    # check argument
+    assert os.path.exists(
+        args.model_path), 'The model_path path does not exist.'
+    assert os.path.exists(
+        args.feature_dict), 'The feature_dict path does not exist.'
+    paddle.init(use_gpu=False, trainer_count=1)
+    with open(args.feature_dict) as f:
+        feature_dict = cPickle.load(f)
+    # load the trained model.
+    with gzip.open(args.model_path) as f:
+        parameters = paddle.parameters.Parameters.from_tar(f)
+    nid_dict = feature_dict['history_clicked_items']
+    nid_to_word = dict((v, k) for k, v in nid_dict.items())
+    nce_w = parameters.get("nce_w")
+    nce_b = parameters.get("nce_b")
+    item_vector = convt_simple_lsh(get_item_vec_from_softmax(nce_w, nce_b))
+    for i in range(0, len(item_vector)):
+        itemid = nid_to_word[i]
+        print itemid + "\t" + ",".join(map(str, item_vector[i]))
+# Script entry point: dump the item vectors only when executed directly.
+if __name__ == "__main__":
+    item_vector()
--- a/youtube_recall/network_conf.py
+++ b/youtube_recall/network_conf.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import paddle.v2 as paddle
+import cPickle
+class DNNmodel(object):
+    """
+    Deep Neural Networks for YouTube candidate generation
+    """
+    def __init__(self,
+                 dnn_layer_dims=None,
+                 feature_dict=None,
+                 item_freq=None,
+                 is_infer=False):
+        """
+        Initialize the model and build the whole network right away; the
+        result of _build_dnn_model() is exposed as `self.model_cost`.
+        @dnn_layer_dims: dimension of each hidden layer
+        @feature_dict: dictionary of encoded feature; one sub-dict per
+                       feature name, its length gives the input dimension
+        @item_freq: item frequencies (presumably the content of
+                    item_freq.pkl -- TODO confirm against the caller)
+        @is_infer: if infer mode
+        """
+        self._dnn_layer_dims = dnn_layer_dims
+        self._feature_dict = feature_dict
+        self._item_freq = item_freq
+        self._is_infer = is_infer
+        # build model: inputs first, then the embeddings on top of them,
+        # then the network that consumes the embeddings
+        self._build_input_layer()
+        self._build_embedding_layer()
+        self.model_cost = self._build_dnn_model()
+    def _build_input_layer(self):
+        """
+        Build input layer: one data layer per feature, each sized by the
+        number of entries of that feature in the feature dict.
+        """
+        # click history features are sequences of ids
+        self._history_clicked_items = paddle.layer.data(
+            name="history_clicked_items",
+            type=paddle.data_type.integer_value_sequence(
+                len(self._feature_dict['history_clicked_items'])))
+        self._history_clicked_categories = paddle.layer.data(
+            name="history_clicked_categories",
+            type=paddle.data_type.integer_value_sequence(
+                len(self._feature_dict['history_clicked_categories'])))
+        self._history_clicked_tags = paddle.layer.data(
+            name="history_clicked_tags",
+            type=paddle.data_type.integer_value_sequence(
+                len(self._feature_dict['history_clicked_tags'])))
+        # user profile features are single ids
+        self._user_id = paddle.layer.data(
+            name="user_id",
+            type=paddle.data_type.integer_value(
+                len(self._feature_dict['user_id'])))
+        self._province = paddle.layer.data(
+            name="province",
+            type=paddle.data_type.integer_value(
+                len(self._feature_dict['province'])))
+        self._city = paddle.layer.data(
+            name="city",
+            type=paddle.data_type.integer_value(
+                len(self._feature_dict['city'])))
+        self._phone = paddle.layer.data(
+            name="phone",
+            type=paddle.data_type.integer_value(
+                len(self._feature_dict['phone'])))
+        # the target item shares the id space of the clicked items
+        self._target_item = paddle.layer.data(
+            name="target_item",
+            type=paddle.data_type.integer_value(
+                len(self._feature_dict['history_clicked_items'])))
+    def _create_emb_attr(self, name):
+        """
+        create embedding parameter
+
+        @name: parameter name given to the returned paddle.attr.Param
+        @return: a paddle.attr.Param with initial_std=0.001,
+                 learning_rate=1, l2_rate=0 and sparse_update=False
+        """
+        return paddle.attr.Param(
+            name=name,
+            initial_std=0.001,
+            learning_rate=1,
+            l2_rate=0,
+            sparse_update=False)
+    def _build_embedding_layer(self):
+        """
+        build embedding layer
+
+        Creates one embedding layer per data layer declared in
+        _build_input_layer (target_item excepted) and stores it on self as
+        self._<feature>_emb. Embedding sizes: user_id 64, province 8,
+        city 16, phone 16, history_clicked_items 64,
+        history_clicked_categories 8, history_clicked_tags 64. Each
+        embedding parameter is named '_proj_<feature>' and is built with
+        _create_emb_attr.
+        """
+        # Embeddings of the single-valued profile features.
+        self._user_id_emb = paddle.layer.embedding(
+            input=self._user_id,
+            size=64,
+            param_attr=self._create_emb_attr('_proj_user_id'))
+        self._province_emb = paddle.layer.embedding(
+            input=self._province,
+            size=8,
+            param_attr=self._create_emb_attr('_proj_province'))
+        self._city_emb = paddle.layer.embedding(
+            input=self._city,
+            size=16,
+            param_attr=self._create_emb_attr('_proj_city'))
+        self._phone_emb = paddle.layer.embedding(
+            input=self._phone,
+            size=16,
+            param_attr=self._create_emb_attr('_proj_phone'))
+        # Embeddings of the sequence-valued click-history features.
+        self._history_clicked_items_emb = paddle.layer.embedding(
+            input=self._history_clicked_items,
+            size=64,
+            param_attr=self._create_emb_attr('_proj_history_clicked_items'))
+        self._history_clicked_categories_emb = paddle.layer.embedding(
+            input=self._history_clicked_categories,
+            size=8,
+            param_attr=self._create_emb_attr(
+                '_proj_history_clicked_categories'))
+        self._history_clicked_tags_emb = paddle.layer.embedding(
+            input=self._history_clicked_tags,
+            size=64,
+            param_attr=self._create_emb_attr('_proj_history_clicked_tags'))
+    def _build_dnn_model(self):
+        """
+        build dnn model
+
+        Stacks an LSTM over the clicked-item embeddings, pools the three
+        history sequences, and feeds them together with the profile
+        embeddings through two ReLU fully-connected layers whose sizes are
+        self._dnn_layer_dims[0] and self._dnn_layer_dims[1].
+
+        @return: when self._is_infer is false, the NCE cost layer;
+                 otherwise the tuple (self.prediction_layer, self._fc_1).
+                 Callers must handle both shapes.
+        """
+        self._rnn_cell = paddle.networks.simple_lstm(
+            input=self._history_clicked_items_emb, size=64)
+        # NOTE: despite the attribute name, the LSTM output sequence is
+        # reduced with max pooling, not by taking its last step.
+        self._lstm_last = paddle.layer.pooling(
+            input=self._rnn_cell, pooling_type=paddle.pooling.Max())
+        # Category and tag histories are reduced by average pooling.
+        self._avg_emb_cats = paddle.layer.pooling(
+            input=self._history_clicked_categories_emb,
+            pooling_type=paddle.pooling.Avg())
+        self._avg_emb_tags = paddle.layer.pooling(
+            input=self._history_clicked_tags_emb,
+            pooling_type=paddle.pooling.Avg())
+        # First hidden layer takes all seven feature vectors as inputs.
+        self._fc_0 = paddle.layer.fc(
+            name="Relu1",
+            input=[
+                self._lstm_last, self._user_id_emb, self._province_emb,
+                self._city_emb, self._avg_emb_cats, self._avg_emb_tags,
+                self._phone_emb
+            ],
+            size=self._dnn_layer_dims[0],
+            act=paddle.activation.Relu())
+        self._fc_1 = paddle.layer.fc(name="Relu2",
+                                     input=self._fc_0,
+                                     size=self._dnn_layer_dims[1],
+                                     act=paddle.activation.Relu())
+        if not self._is_infer:
+            # Training: NCE cost against target_item, with 5 negative
+            # samples per example and self._item_freq passed as the
+            # negative-sampling distribution.
+            return paddle.layer.nce(
+                input=self._fc_1,
+                label=self._target_item,
+                num_classes=len(self._feature_dict['history_clicked_items']),
+                param_attr=paddle.attr.Param(name="nce_w"),
+                bias_attr=paddle.attr.Param(name="nce_b"),
+                num_neg_samples=5,
+                neg_distribution=self._item_freq)
+        else:
+            # Inference: a softmax layer over all items that refers to the
+            # same parameter names ("nce_w", "nce_b") as the NCE layer
+            # above, applied through a transposed full-matrix projection.
+            self.prediction_layer = paddle.layer.mixed(
+                size=len(self._feature_dict['history_clicked_items']),
+                input=paddle.layer.trans_full_matrix_projection(
+                    self._fc_1, param_attr=paddle.attr.Param(name="nce_w")),
+                act=paddle.activation.Softmax(),
+                bias_attr=paddle.attr.Param(name="nce_b"))
+            return self.prediction_layer, self._fc_1
+if __name__ == "__main__":
+    # this is to test and debug the network topology definition.
+    # please set the hyper-parameters as needed.
+    # Both pickles are read from hard-coded paths relative to the current
+    # working directory.
+    item_freq_path = "./output/item_freq.pkl"
+    with open(item_freq_path) as f:
+        item_freq = cPickle.load(f)
+    feature_dict_path = "./output/feature_dict.pkl"
+    with open(feature_dict_path) as f:
+        feature_dict = cPickle.load(f)
+    # Build the training-mode topology.
+    a = DNNmodel(
+        dnn_layer_dims=[256, 31],
+        feature_dict=feature_dict,
+        item_freq=item_freq,
+        is_infer=False)
--- a/youtube_recall/reader.py
+++ b/youtube_recall/reader.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import sys
+from utils import logger
+from utils import TaskMode
+class Reader(object):
+    """
+    Reader
+
+    Turns tab-separated click-log lines into integer-encoded samples.
+    Each input line is split on tabs into:
+        fields[0] user id, fields[1] province, fields[2] city,
+        fields[3] item infos, fields[4] phone
+    where item infos is a ';'-separated list of 'item:category:tags'
+    entries and tags is a '_'-separated list. Raw values are mapped to
+    integer ids through feature_dict; values missing from a mapping fall
+    back to that mapping's '<unk>' id.
+    """
+    def __init__(self, feature_dict=None, window_size=20):
+        """
+        init
+        @feature_dict: dict of per-feature mappings (raw value -> id),
+                       keyed by feature name
+        @window_size: maximum number of preceding history entries used
+                      for one training/test sample
+        """
+        self._feature_dict = feature_dict
+        self._window_size = window_size
+    def train(self, path):
+        """
+        load train set
+        @path: train set path
+        @return: generator of training samples (see _reader)
+        """
+        logger.info("start train reader from %s" % path)
+        mode = TaskMode.create_train()
+        return self._reader(path, mode)
+    def test(self, path):
+        """
+        load test set
+        @path: test set path
+        @return: generator of test samples (see _reader)
+        """
+        logger.info("start test reader from %s" % path)
+        mode = TaskMode.create_test()
+        return self._reader(path, mode)
+    def infer(self, path):
+        """
+        load infer set
+        @path: infer set path
+        @return: generator of inference samples (see _reader)
+        """
+        logger.info("start infer reader from %s" % path)
+        mode = TaskMode.create_infer()
+        return self._reader(path, mode)
+    def infer_user(self, user_list):
+        """
+        load user set to infer
+        @user_list: user list
+        @return: generator of samples carrying only the user id
+                 (see _reader_user)
+        """
+        return self._reader_user(user_list)
+    def _reader(self, path, mode):
+        """
+        parse data set
+        @path: path of the tab-separated data file
+        @mode: TaskMode; only mode.is_infer() is consulted
+
+        Generator. When mode is not infer, every line yields one sample per
+        clicked item:
+            user_id, province, city, history_clicked_items,
+            history_clicked_categories, history_clicked_tags, phone,
+            target_item
+        In infer mode every line yields a single sample with the full
+        history and no target_item.
+        """
+        # Fallback ids; each is None when the mapping has no '<unk>' key.
+        USER_ID_UNK = self._feature_dict['user_id'].get('<unk>')
+        PROVINCE_UNK = self._feature_dict['province'].get('<unk>')
+        CITY_UNK = self._feature_dict['city'].get('<unk>')
+        ITEM_UNK = self._feature_dict['history_clicked_items'].get('<unk>')
+        CATEGORY_UNK = self._feature_dict['history_clicked_categories'].get(
+            '<unk>')
+        TAG_UNK = self._feature_dict['history_clicked_tags'].get('<unk>')
+        PHONE_UNK = self._feature_dict['phone'].get('<unk>')
+        with open(path) as f:
+            for line in f:
+                fields = line.strip('\n').split('\t')
+                user_id = self._feature_dict['user_id'].get(fields[0],
+                                                            USER_ID_UNK)
+                province = self._feature_dict['province'].get(fields[1],
+                                                              PROVINCE_UNK)
+                city = self._feature_dict['city'].get(fields[2], CITY_UNK)
+                item_infos = fields[3]
+                phone = self._feature_dict['phone'].get(fields[4], PHONE_UNK)
+                history_clicked_items_all = []
+                history_clicked_tags_all = []
+                history_clicked_categories_all = []
+                for item_info in item_infos.split(';'):
+                    item_info_array = item_info.split(':')
+                    item = item_info_array[0]
+                    item_encoded_id = self._feature_dict['history_clicked_items'].get(\
+                            item, ITEM_UNK)
+                    # Clicks on unknown items are dropped entirely, together
+                    # with their category and tags, so the three lists stay
+                    # aligned.
+                    if item_encoded_id != ITEM_UNK:
+                        history_clicked_items_all.append(item_encoded_id)
+                        category = item_info_array[1]
+                        history_clicked_categories_all.append(
+                                self._feature_dict['history_clicked_categories'].get(\
+                                        category, CATEGORY_UNK))
+                        tags = item_info_array[2]
+                        # One item can have several tags; keep them as a
+                        # single '_'-joined string of encoded ids so there is
+                        # exactly one list entry per item.
+                        # NOTE(review): if the tag mapping has no '<unk>'
+                        # key, an unknown tag becomes the string "None" and
+                        # the int() conversion below will raise - confirm
+                        # the dictionaries always contain '<unk>'.
+                        tag_split = map(str, [self._feature_dict['history_clicked_tags'].get(\
+                                tag, TAG_UNK) \
+                                for tag in tags.strip().split("_")])
+                        history_clicked_tags_all.append("_".join(tag_split))
+                if not mode.is_infer():
+                    # Prepend a 0 entry to every history so that the first
+                    # real click can be a target with a non-empty history.
+                    history_clicked_items_all.insert(0, 0)
+                    history_clicked_tags_all.insert(0, "0")
+                    history_clicked_categories_all.insert(0, 0)
+                    # Position i is the target; the (at most window_size)
+                    # entries before it form the history.
+                    for i in range(1, len(history_clicked_items_all)):
+                        start = max(0, i - self._window_size)
+                        history_clicked_items = history_clicked_items_all[start:
+                                                                          i]
+                        history_clicked_categories = history_clicked_categories_all[
+                            start:i]
+                        history_clicked_tags_str = history_clicked_tags_all[
+                            start:i]
+                        # Flatten the per-item tag strings into one list of
+                        # integer tag ids.
+                        history_clicked_tags = []
+                        for tags_a in history_clicked_tags_str:
+                            for tag in tags_a.split("_"):
+                                history_clicked_tags.append(int(tag))
+                        target_item = history_clicked_items_all[i]
+                        yield user_id, province, city, \
+                              history_clicked_items, history_clicked_categories, \
+                              history_clicked_tags, phone, target_item
+                else:
+                    # Inference uses the whole history as-is: no leading 0
+                    # entry, no windowing, no target.
+                    history_clicked_items = history_clicked_items_all
+                    history_clicked_categories = history_clicked_categories_all
+                    history_clicked_tags_str = history_clicked_tags_all
+                    history_clicked_tags = []
+                    for tags_a in history_clicked_tags_str:
+                        for tag in tags_a.split("_"):
+                            history_clicked_tags.append(int(tag))
+                    yield user_id, province, city, \
+                          history_clicked_items, history_clicked_categories, \
+                          history_clicked_tags, phone
+    def _reader_user(self, user_list):
+        """
+        parse user list
+        @user_list: iterable of raw user ids
+
+        Generator. Yields one sample per user in the same field order as
+        the infer branch of _reader, with the encoded user id and 0 / [0]
+        for every other field.
+        """
+        USER_ID_UNK = self._feature_dict['user_id'].get('<unk>')
+        for user in user_list:
+            user_id = self._feature_dict['user_id'].get(user, USER_ID_UNK)
+            yield user_id, 0, 0, [0], [0], [0], 0
+if __name__ == "__main__":
+    # this is to test and debug reader function
+    # usage: reader.py <train_data_path> <feature_dict_pickle> <window_size>
+    train_data = sys.argv[1]
+    feature_dict = sys.argv[2]
+    window_size = int(sys.argv[3])
+    import cPickle
+    # feature_dict is rebound from the pickle path to the loaded object.
+    with open(feature_dict) as f:
+        feature_dict = cPickle.load(f)
+    r = Reader(feature_dict, window_size)
+    # Dump every generated training sample to stdout.
+    for dat in r.train(train_data):
+        print dat
--- a/youtube_recall/train.py
+++ b/youtube_recall/train.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import os
+import gzip
+import paddle.v2 as paddle
+import argparse
+import cPickle
+from reader import Reader
+from network_conf import DNNmodel
+from utils import logger
+def parse_args():
+    """
+    parse arguments
+
+    Required: --train_set_path, --test_set_path, --model_output_dir,
+    --feature_dict, --item_freq.
+    Optional: --window_size (default 20), --num_passes (default 1),
+    --batch_size (default 50).
+
+    @return: the argparse namespace parsed from the command line
+    """
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle Youtube Recall Model Example")
+    parser.add_argument(
+        '--train_set_path',
+        type=str,
+        required=True,
+        help="path of the train set")
+    parser.add_argument(
+        '--test_set_path', type=str, required=True, help="path of the test set")
+    parser.add_argument(
+        '--model_output_dir',
+        type=str,
+        required=True,
+        help="directory to output")
+    parser.add_argument(
+        '--feature_dict',
+        type=str,
+        required=True,
+        help="path of feature_dict.pkl")
+    parser.add_argument(
+        '--item_freq', type=str, required=True, help="path of item_freq.pkl ")
+    parser.add_argument(
+        '--window_size', type=int, default=20, help="window size(default: 20)")
+    parser.add_argument(
+        '--num_passes', type=int, default=1, help="number of passes to train")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=50,
+        help="size of mini-batch (default:50)")
+    return parser.parse_args()
+def train():
+    """
+    train
+
+    Parses the command line, loads the feature dictionary and item
+    frequencies, builds the training topology and runs the paddle SGD
+    trainer. Parameters are written to
+    <model_output_dir>/model_pass_<pass id>.tar.gz at the end of every
+    pass.
+    """
+    args = parse_args()
+    # check argument
+    # NOTE(review): test_set_path is required and checked here but is not
+    # read anywhere else in this function.
+    assert os.path.exists(
+        args.train_set_path), 'The train_set_path path does not exist.'
+    assert os.path.exists(
+        args.test_set_path), 'The test_set_path path does not exist.'
+    assert os.path.exists(
+        args.feature_dict), 'The feature_dict path does not exist.'
+    assert os.path.exists(args.item_freq), 'The item_freq path does not exist.'
+    assert os.path.exists(
+        args.model_output_dir), 'The model_output_dir path does not exist.'
+    paddle.init(use_gpu=False, trainer_count=1)
+    with open(args.feature_dict) as f:
+        feature_dict = cPickle.load(f)
+    with open(args.item_freq) as f:
+        item_freq = cPickle.load(f)
+    # Maps each data layer name to its position in the tuples yielded by
+    # Reader.train.
+    feeding = {
+        'user_id': 0,
+        'province': 1,
+        'city': 2,
+        'history_clicked_items': 3,
+        'history_clicked_categories': 4,
+        'history_clicked_tags': 5,
+        'phone': 6,
+        'target_item': 7
+    }
+    optimizer = paddle.optimizer.AdaGrad(
+        learning_rate=1e-1,
+        regularization=paddle.optimizer.L2Regularization(rate=1e-3))
+    # With is_infer=False, model_cost is the training cost layer.
+    cost = DNNmodel(
+        dnn_layer_dims=[256, 31],
+        feature_dict=feature_dict,
+        item_freq=item_freq,
+        is_infer=False).model_cost
+    parameters = paddle.parameters.create(cost)
+    trainer = paddle.trainer.SGD(cost, parameters, optimizer)
+    def event_handler(event):
+        """
+        event handler
+
+        Logs the cost on every 10th batch (batch 0 excluded) and saves the
+        parameters at the end of each pass.
+        """
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id and not event.batch_id % 10:
+                logger.info("Pass %d, Batch %d, Cost %f" %
+                            (event.pass_id, event.batch_id, event.cost))
+        elif isinstance(event, paddle.event.EndPass):
+            save_path = os.path.join(args.model_output_dir,
+                                     "model_pass_%05d.tar.gz" % event.pass_id)
+            logger.info("Save model into %s ..." % save_path)
+            with gzip.open(save_path, "w") as f:
+                trainer.save_parameter_to_tar(f)
+    reader = Reader(feature_dict, args.window_size)
+    # Samples are shuffled through a 7000-entry buffer and then grouped
+    # into mini-batches of args.batch_size.
+    trainer.train(
+        paddle.batch(
+            paddle.reader.shuffle(
+                lambda: reader.train(args.train_set_path), buf_size=7000),
+            args.batch_size),
+        num_passes=args.num_passes,
+        feeding=feeding,
+        event_handler=event_handler)
+if __name__ == "__main__":
+    # Run training only when executed as a script, not on import.
+    train()
--- a/youtube_recall/user_vector.py
+++ b/youtube_recall/user_vector.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import os
+import gzip
+import paddle.v2 as paddle
+import argparse
+import cPickle
+from reader import Reader
+from network_conf import DNNmodel
+from utils import logger
+import numpy as np
+def parse_args():
+    """
+    parse arguments
+
+    Required: --infer_set_path, --model_path, --feature_dict.
+    Optional: --batch_size (default 50).
+
+    @return: the argparse namespace parsed from the command line
+    """
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle Youtube Recall Model Example")
+    parser.add_argument(
+        '--infer_set_path',
+        type=str,
+        required=True,
+        help="path of the infer set")
+    parser.add_argument(
+        '--model_path', type=str, required=True, help="path of the model")
+    parser.add_argument(
+        '--feature_dict',
+        type=str,
+        required=True,
+        help="path of feature_dict.pkl")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=50,
+        help="size of mini-batch (default:50)")
+    return parser.parse_args()
+def user_vector():
+    """
+    get user vectors
+
+    Loads a trained model, builds the inference topology and runs every
+    sample of the infer set through get_a_batch_user_vector in batches of
+    args.batch_size.
+    """
+    args = parse_args()
+    # check argument
+    assert os.path.exists(
+        args.infer_set_path), 'The infer_set_path path does not exist.'
+    assert os.path.exists(
+        args.model_path), 'The model_path path does not exist.'
+    assert os.path.exists(
+        args.feature_dict), 'The feature_dict path does not exist.'
+    paddle.init(use_gpu=False, trainer_count=1)
+    with open(args.feature_dict) as f:
+        feature_dict = cPickle.load(f)
+    # load the trained model.
+    with gzip.open(args.model_path) as f:
+        parameters = paddle.parameters.Parameters.from_tar(f)
+    # build model
+    # With is_infer=True, model_cost is unpacked here as the pair
+    # (prediction layer, last fully-connected layer). item_freq is not
+    # passed to DNNmodel in this call.
+    prediction_layer, fc = DNNmodel(
+        dnn_layer_dims=[256, 31], feature_dict=feature_dict,
+        is_infer=True).model_cost
+    inferer = paddle.inference.Inference(
+        output_layer=[prediction_layer, fc], parameters=parameters)
+    reader = Reader(feature_dict)
+    test_batch = []
+    # idx is not used; samples are simply accumulated into fixed-size
+    # batches.
+    for idx, item in enumerate(reader.infer(args.infer_set_path)):
+        test_batch.append(item)
+        if len(test_batch) == args.batch_size:
+            get_a_batch_user_vector(inferer, test_batch)
+            test_batch = []
+    # Flush the final, possibly smaller, batch.
+    if len(test_batch):
+        get_a_batch_user_vector(inferer, test_batch)
+def get_a_batch_user_vector(inferer, test_batch):
+    """
+    input a batch of data and get user vectors
+    """
+    feeding = {
+        'user_id': 0,
+        'province': 1,
+        'city': 2,
+        'history_clicked_items': 3,
+        'history_clicked_categories': 4,
+        'history_clicked_tags': 5,
+        'phone': 6
+    }
+    probs = inferer.infer(
+        input=test_batch,
+        feeding=feeding,
+        field=["value"],
+        flatten_result=False)
+    for i, res in enumerate(zip(probs[1])):
+        # do simple lsh conversion
+        user_vector = [1.000]
+        for i in res[0]:
+            user_vector.append(i)
+        user_vector.append(0.000)
+        norm = np.linalg.norm(user_vector)
+        user_vector_norm = [str(_ / norm) for _ in user_vector]
+        print ",".join(user_vector_norm)
+if __name__ == "__main__":
+    # Compute and print user vectors only when executed as a script.
+    user_vector()
--- a/youtube_recall/utils.py
+++ b/youtube_recall/utils.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import logging
+# Configure logging with the default handler and expose a shared logger
+# named "paddle" at INFO level for the other modules to import.
+logging.basicConfig()
+logger = logging.getLogger("paddle")
+logger.setLevel(logging.INFO)
+class TaskMode(object):
+    """
+    TaskMode
+
+    Small value object tagging a run as train, test or infer. Instances
+    are normally obtained through the create_* static factories.
+    """
+    # Integer codes for the three supported modes.
+    TRAIN_MODE = 0
+    TEST_MODE = 1
+    INFER_MODE = 2
+    def __init__(self, mode):
+        """
+        :param mode: one of TRAIN_MODE, TEST_MODE or INFER_MODE
+        """
+        self.mode = mode
+    def is_train(self):
+        """
+        :return: True if this mode is TRAIN_MODE
+        """
+        return self.mode == self.TRAIN_MODE
+    def is_test(self):
+        """
+        :return: True if this mode is TEST_MODE
+        """
+        return self.mode == self.TEST_MODE
+    def is_infer(self):
+        """
+        :return: True if this mode is INFER_MODE
+        """
+        return self.mode == self.INFER_MODE
+    @staticmethod
+    def create_train():
+        """
+        :return: a new TaskMode in TRAIN_MODE
+        """
+        return TaskMode(TaskMode.TRAIN_MODE)
+    @staticmethod
+    def create_test():
+        """
+        :return: a new TaskMode in TEST_MODE
+        """
+        return TaskMode(TaskMode.TEST_MODE)
+    @staticmethod
+    def create_infer():
+        """
+        :return: a new TaskMode in INFER_MODE
+        """
+        return TaskMode(TaskMode.INFER_MODE)