add ssr

ff0bd6f5 · zhangwenhui03 · 28aa02df · ff0bd6f5 · ff0bd6f5 · ff0bd6f5
12 changed file
--- a/fluid/PaddleRec/gru4rec/README.md
+++ b/fluid/PaddleRec/gru4rec/README.md
@@ -32,7 +32,15 @@ GRU4REC模型的介绍可以参阅论文[Session-based Recommendations with Recu

 session-based推荐应用场景非常广泛，比如用户的商品浏览、新闻点击、地点签到等序列数据。

-支持三种形式的损失函数, 分别是全词表的cross-entropy, 采负样本的Bayesian Pairwise Ranking和采负样本的Cross-entropy.
+支持三种形式的损失函数, 分别是全词表的cross-entropy, 负采样的Bayesian Pairwise Ranking和负采样的Cross-entropy.
+
+我们基本复现了论文效果，recall@20的效果分别为
+
+全词表 cross entropy : 0.67
+
+负采样 bpr : 0.606
+
+负采样 cross entropy : 0.605


 运行样例程序可跳过'RSC15 数据下载及预处理'部分
@@ -113,30 +121,42 @@ python text2paddle.py raw_train_data/ raw_test_data/ train_data test_data vocab.
 ```

 ## 训练
-'--use_cuda 1' 表示使用gpu, 缺省表示使用cpu '--parallel 1' 表示使用多卡，缺省表示使用单卡

 具体的参数配置可运行
 ```
 python train.py -h
 ```
+全词表cross entropy 训练代码

-GPU 环境
-运行命令开始训练模型。
+gpu 单机单卡训练
+``` bash
+CUDA_VISIBLE_DEVICES=0 python train.py --train_dir train_data --use_cuda 1 --batch_size 50 --model_dir model_output
 ```
-CUDA_VISIBLE_DEVICES=0 python train.py --train_dir train_data/ --use_cuda 1
+
+cpu 单机训练
+``` bash
+python train.py --train_dir train_data --use_cuda 0 --batch_size 50 --model_dir model_output
 ```
-CPU 环境
-运行命令开始训练模型。
+
+gpu 单机多卡训练
+``` bash
+CUDA_VISIBLE_DEVICES=0,1 python train.py --train_dir train_data --use_cuda 1 --parallel 1 --batch_size 50 --model_dir model_output --num_devices 2
 ```
-python train.py --train_dir train_data/
+
+cpu 单机多卡训练
+``` bash
+CPU_NUM=10 python train.py --train_dir train_data --use_cuda 0 --parallel 1 --batch_size 50 --model_dir model_output --num_devices 10
 ```

-bayesian pairwise ranking loss(bpr loss) 训练
+负采样 bayesian pairwise ranking loss(bpr loss) 训练
 ```
 CUDA_VISIBLE_DEVICES=0 python train_sample_neg.py --loss bpr --use_cuda 1
 ```

-请注意CPU环境下运行单机多卡任务（--parallel 1)时，batch_size应大于cpu核数。
+负采样 cross entropy  训练
+```
+CUDA_VISIBLE_DEVICES=0 python train_sample_neg.py --loss ce --use_cuda 1
+```

 ## 自定义网络结构


--- a/fluid/PaddleRec/gru4rec/net.py
+++ b/fluid/PaddleRec/gru4rec/net.py
@@ -178,7 +178,7 @@ def train_cross_entropy_network(vocab_size, neg_size, hid_size, drop_out=0.2):
    return src, pos_label, label, cost_sum


-def infer_bpr_network(vocab_size, batch_size, hid_size, dropout=0.2):
+def infer_network(vocab_size, batch_size, hid_size, dropout=0.2):
    src = fluid.layers.data(name="src", shape=[1], dtype="int64", lod_level=1)
    emb_src = fluid.layers.embedding(
        input=src, size=[vocab_size, hid_size], param_attr="emb")

--- a/fluid/PaddleRec/ssr/README.md
+++ b/fluid/PaddleRec/ssr/README.md
@@ -16,18 +16,34 @@ Sequence Semantic Retrieval(SSR) Model shares the similar idea with Multi-Rate D
 Dataset preprocessing follows the method of [GRU4Rec Project](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleRec/gru4rec). Note that you should reuse scripts from GRU4Rec project for data preprocessing.

 ## Training
-Before training, you should set PYTHONPATH environment
+
+The command line options for training can be listed by `python train.py -h`
+
+gpu 单机单卡训练
+``` bash
+CUDA_VISIBLE_DEVICES=0 python train.py --train_dir train_data --use_cuda 1 --batch_size 50 --model_dir model_output
 ```
-export PYTHONPATH=./models/fluid:$PYTHONPATH
+
+cpu 单机训练
+``` bash
+python train.py --train_dir train_data --use_cuda 0 --batch_size 50 --model_dir model_output
 ```

-The command line options for training can be listed by `python train.py -h`
+gpu 单机多卡训练
 ``` bash
-python train.py --train_file rsc15_train_tr_paddle.txt
+CUDA_VISIBLE_DEVICES=0,1 python train.py --train_dir train_data --use_cuda 1 --parallel 1 --batch_size 50 --model_dir model_output --num_devices 2
 ```

-## Build Index
-TBA
+cpu 单机多卡训练
+``` bash
+CPU_NUM=10 python train.py --train_dir train_data --use_cuda 0 --parallel 1 --batch_size 50 --model_dir model_output --num_devices 10
+```
+
+多机训练 参考fluid/PaddleRec/gru4rec下的配置

-## Retrieval
-TBA
+## Inference
+
+gpu 预测
+``` bash
+CUDA_VISIBLE_DEVICES=0 python infer.py --test_dir test_data --use_cuda 1 --batch_size 50 --model_dir model_output
+```
--- a/fluid/PaddleRec/ssr/__init__.py
+++ b/fluid/PaddleRec/ssr/__init__.py
--- a/fluid/PaddleRec/ssr/infer.py
+++ b/fluid/PaddleRec/ssr/infer.py
+import sys
+import argparse
+import time
+import math
+import unittest
+import contextlib
+import numpy as np
+import six
+import paddle.fluid as fluid
+import paddle
+import utils
+import nets as net
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("ssr benchmark.")
+    parser.add_argument(
+        '--test_dir', type=str, default='test_data', help='test file address')
+    parser.add_argument(
+        '--vocab_path', type=str, default='vocab.txt', help='vocab path')
+    parser.add_argument(
+        '--start_index', type=int, default='1', help='start index')
+    parser.add_argument(
+        '--last_index', type=int, default='10', help='end index')
+    parser.add_argument(
+        '--model_dir', type=str, default='model_output', help='model dir')
+    parser.add_argument(
+        '--use_cuda', type=int, default='0', help='whether use cuda')
+    parser.add_argument(
+        '--batch_size', type=int, default='50', help='batch_size')
+    parser.add_argument(
+        '--hid_size', type=int, default='128', help='hidden size')
+    parser.add_argument('--emb_size', type=int, default='128', help='emb size')
+    args = parser.parse_args()
+    return args
+
+
+def model(vocab_size, emb_size, hidden_size):
+    user_data = fluid.layers.data(
+        name="user", shape=[1], dtype="int64", lod_level=1)
+    all_item_data = fluid.layers.data(
+        name="all_item", shape=[vocab_size, 1], dtype="int64")
+
+    user_emb = fluid.layers.embedding(
+        input=user_data, size=[vocab_size, emb_size], param_attr="emb.item")
+    all_item_emb = fluid.layers.embedding(
+        input=all_item_data, size=[vocab_size, emb_size], param_attr="emb.item")
+    all_item_emb_re = fluid.layers.reshape(x=all_item_emb, shape=[-1, emb_size])
+
+    user_encoder = net.GrnnEncoder(hidden_size=hidden_size)
+    user_enc = user_encoder.forward(user_emb)
+    user_hid = fluid.layers.fc(input=user_enc,
+                               size=hidden_size,
+                               param_attr='user.w',
+                               bias_attr="user.b")
+    user_exp = fluid.layers.expand(x=user_hid, expand_times=[1, vocab_size])
+    user_re = fluid.layers.reshape(x=user_exp, shape=[-1, hidden_size])
+
+    all_item_hid = fluid.layers.fc(input=all_item_emb_re,
+                                   size=hidden_size,
+                                   param_attr='item.w',
+                                   bias_attr="item.b")
+    cos_item = fluid.layers.cos_sim(X=all_item_hid, Y=user_re)
+    all_pre_ = fluid.layers.reshape(x=cos_item, shape=[-1, vocab_size])
+    pos_label = fluid.layers.data(name="pos_label", shape=[1], dtype="int64")
+    acc = fluid.layers.accuracy(input=all_pre_, label=pos_label, k=20)
+    return acc
+
+
+def infer(args, vocab_size, test_reader):
+    """ inference function """
+    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    emb_size = args.emb_size
+    hid_size = args.hid_size
+    batch_size = args.batch_size
+    model_path = args.model_dir
+    with fluid.scope_guard(fluid.core.Scope()):
+        main_program = fluid.Program()
+        start_up_program = fluid.Program()
+        with fluid.program_guard(main_program, start_up_program):
+            acc = model(vocab_size, emb_size, hid_size)
+            for epoch in xrange(start_index, last_index + 1):
+                copy_program = main_program.clone()
+                model_path = model_dir + "/epoch_" + str(epoch)
+                fluid.io.load_params(
+                    executor=exe, dirname=model_path, main_program=copy_program)
+                accum_num_recall = 0.0
+                accum_num_sum = 0.0
+                t0 = time.time()
+                step_id = 0
+                for data in test_reader():
+                    step_id += 1
+                    user_data, pos_label = utils.infer_data(data, place)
+                    all_item_numpy = np.tile(
+                        np.arange(vocab_size), len(pos_label)).reshape(
+                            len(pos_label), vocab_size, 1)
+                    para = exe.run(copy_program,
+                                   feed={
+                                       "user": user_data,
+                                       "all_item": all_item_numpy,
+                                       "pos_label": pos_label
+                                   },
+                                   fetch_list=[acc.name],
+                                   return_numpy=False)
+
+                    acc_ = para[0]._get_float_element(0)
+                    data_length = len(
+                        np.concatenate(
+                            pos_label, axis=0).astype("int64"))
+                    accum_num_sum += (data_length)
+                    accum_num_recall += (data_length * acc_)
+                    if step_id % 1 == 0:
+                        print("step:%d  " % (step_id),
+                              accum_num_recall / accum_num_sum)
+                t1 = time.time()
+                print("model:%s recall@20:%.3f time_cost(s):%.2f" %
+                      (model_path, accum_num_recall / accum_num_sum, t1 - t0))
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    start_index = args.start_index
+    last_index = args.last_index
+    test_dir = args.test_dir
+    model_dir = args.model_dir
+    batch_size = args.batch_size
+    vocab_path = args.vocab_path
+    use_cuda = True if args.use_cuda else False
+    print("start index: ", start_index, " last_index:", last_index)
+    test_reader, vocab_size = utils.construct_test_data(
+        test_dir, vocab_path, batch_size=args.batch_size)
+    infer(args, vocab_size, test_reader=test_reader)
--- a/fluid/PaddleRec/ssr/nets.py
+++ b/fluid/PaddleRec/ssr/nets.py
@@ -17,28 +17,53 @@ import paddle.fluid.layers.nn as nn
 import paddle.fluid.layers.tensor as tensor
 import paddle.fluid.layers.control_flow as cf
 import paddle.fluid.layers.io as io
-from PaddleRec.multiview_simnet.nets import BowEncoder
-from PaddleRec.multiview_simnet.nets import GrnnEncoder
+
+
+class BowEncoder(object):
+    """ bow-encoder """
+
+    def __init__(self):
+        self.param_name = ""
+
+    def forward(self, emb):
+        return nn.sequence_pool(input=emb, pool_type='sum')
+
+
+class GrnnEncoder(object):
+    """ grnn-encoder """
+
+    def __init__(self, param_name="grnn", hidden_size=128):
+        self.param_name = param_name
+        self.hidden_size = hidden_size
+
+    def forward(self, emb):
+        fc0 = nn.fc(input=emb,
+                    size=self.hidden_size * 3,
+                    param_attr=self.param_name + "_fc.w",
+                    bias_attr=False)
+
+        gru_h = nn.dynamic_gru(
+            input=fc0,
+            size=self.hidden_size,
+            is_reverse=False,
+            param_attr=self.param_name + ".param",
+            bias_attr=self.param_name + ".bias")
+        return nn.sequence_pool(input=gru_h, pool_type='max')


 class PairwiseHingeLoss(object):
    def __init__(self, margin=0.8):
        self.margin = margin
+
    def forward(self, pos, neg):
        loss_part1 = nn.elementwise_sub(
            tensor.fill_constant_batch_size_like(
-                input=pos,
-                shape=[-1, 1],
-                value=self.margin,
-                dtype='float32'),
+                input=pos, shape=[-1, 1], value=self.margin, dtype='float32'),
            pos)
        loss_part2 = nn.elementwise_add(loss_part1, neg)
        loss_part3 = nn.elementwise_max(
            tensor.fill_constant_batch_size_like(
-                input=loss_part2, 
-                shape=[-1, 1],
-                value=0.0, 
-                dtype='float32'),
+                input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
            loss_part2)
        return loss_part3

@@ -61,41 +86,37 @@ class SequenceSemanticRetrieval(object):
        return correct

    def train(self):
-        user_data = io.data(
-            name="user", shape=[1], dtype="int64", lod_level=1
-        )
+        user_data = io.data(name="user", shape=[1], dtype="int64", lod_level=1)
        pos_item_data = io.data(
-            name="p_item", shape=[1], dtype="int64", lod_level=1
-        )
+            name="p_item", shape=[1], dtype="int64", lod_level=1)
        neg_item_data = io.data(
-            name="n_item", shape=[1], dtype="int64", lod_level=1
-        )
+            name="n_item", shape=[1], dtype="int64", lod_level=1)
        user_emb = nn.embedding(
-            input=user_data, size=self.emb_shape, param_attr="emb.item"
-        )
+            input=user_data, size=self.emb_shape, param_attr="emb.item")
        pos_item_emb = nn.embedding(
-            input=pos_item_data, size=self.emb_shape, param_attr="emb.item"
-        )
+            input=pos_item_data, size=self.emb_shape, param_attr="emb.item")
        neg_item_emb = nn.embedding(
-            input=neg_item_data, size=self.emb_shape, param_attr="emb.item"
-        )
+            input=neg_item_data, size=self.emb_shape, param_attr="emb.item")
        user_enc = self.user_encoder.forward(user_emb)
        pos_item_enc = self.item_encoder.forward(pos_item_emb)
        neg_item_enc = self.item_encoder.forward(neg_item_emb)
-        user_hid = nn.fc(
-            input=user_enc, size=self.hidden_size, param_attr='user.w', bias_attr="user.b"
-        )
-        pos_item_hid = nn.fc(
-            input=pos_item_enc, size=self.hidden_size, param_attr='item.w', bias_attr="item.b"
-        )
-        neg_item_hid = nn.fc(
-            input=neg_item_enc, size=self.hidden_size, param_attr='item.w', bias_attr="item.b"
-        )
+        user_hid = nn.fc(input=user_enc,
+                         size=self.hidden_size,
+                         param_attr='user.w',
+                         bias_attr="user.b")
+        pos_item_hid = nn.fc(input=pos_item_enc,
+                             size=self.hidden_size,
+                             param_attr='item.w',
+                             bias_attr="item.b")
+        neg_item_hid = nn.fc(input=neg_item_enc,
+                             size=self.hidden_size,
+                             param_attr='item.w',
+                             bias_attr="item.b")
        cos_pos = nn.cos_sim(user_hid, pos_item_hid)
        cos_neg = nn.cos_sim(user_hid, neg_item_hid)
        hinge_loss = self.pairwise_hinge_loss.forward(cos_pos, cos_neg)
        avg_cost = nn.mean(hinge_loss)
        correct = self.get_correct(cos_neg, cos_pos)

-        return [user_data, pos_item_data, neg_item_data], \
-            pos_item_hid, neg_item_hid, avg_cost, correct
+        return [user_data, pos_item_data,
+                neg_item_data], cos_pos, avg_cost, correct
--- a/fluid/PaddleRec/ssr/reader.py
+++ b/fluid/PaddleRec/ssr/reader.py
@@ -14,14 +14,17 @@

 import random

+
 class Dataset:
    def __init__(self):
        pass

+
 class Vocab:
    def __init__(self):
        pass

+
 class YoochooseVocab(Vocab):
    def __init__(self):
        self.vocab = {}
@@ -47,11 +50,10 @@ class YoochooseVocab(Vocab):
    def _get_word_array(self):
        return self.word_array

+
 class YoochooseDataset(Dataset):
-    def __init__(self, y_vocab):
-        self.vocab_size = len(y_vocab.get_vocab())
-        self.word_array = y_vocab._get_word_array()
-        self.vocab = y_vocab.get_vocab()
+    def __init__(self, vocab_size):
+        self.vocab_size = vocab_size

    def sample_neg(self):
        return random.randint(0, self.vocab_size - 1)
@@ -59,10 +61,6 @@ class YoochooseDataset(Dataset):
    def sample_neg_from_seq(self, seq):
        return seq[random.randint(0, len(seq) - 1)]

-    # TODO(guru4elephant): wait memory, should be improved
-    def sample_from_word_freq(self):
-        return self.word_array[random.randint(0, len(self.word_array) - 1)]
-
    def _reader_creator(self, filelist, is_train):
        def reader():
            for f in filelist:
@@ -72,18 +70,16 @@ class YoochooseDataset(Dataset):
                        ids = line.strip().split()
                        if len(ids) <= 1:
                            continue
-                        conv_ids = [self.vocab[i] if i in self.vocab else 0 for i in ids]
-                        # random select an index as boundary
-                        # make ids before boundary as sequence
-                        # make id next to boundary right as target
-                        boundary = random.randint(1, len(ids) - 1)
+                        conv_ids = [i for i in ids]
+                        boundary = len(ids) - 1
                        src = conv_ids[:boundary]
                        pos_tgt = [conv_ids[boundary]]
                        if is_train:
-                            neg_tgt = [self.sample_from_word_freq()]
+                            neg_tgt = [self.sample_neg()]
                            yield [src, pos_tgt, neg_tgt]
                        else:
                            yield [src, pos_tgt]
+
        return reader

    def train(self, file_list):
@@ -91,4 +87,3 @@ class YoochooseDataset(Dataset):

    def test(self, file_list):
        return self._reader_creator(file_list, False)
-
--- a/fluid/PaddleRec/ssr/test_data/small_test.txt
+++ b/fluid/PaddleRec/ssr/test_data/small_test.txt
+0 16 
+475 473 155 
+491 21 
+96 185 96 
+29 14 13 
+5 481 11 21 470 
+70 5 70 11 
+167 42 167 217 
+72 15 73 161 172 
+82 82 
+97 297 97 
+193 182 186 183 184 177 214 
+152 152 
+163 298 7 
+39 73 71 
+490 23 23 496 488 74 23 74 486 23 23 74 
+17 17 
+170 170 483 444 443 234 
+25 472 
+5 5 11 70 69 
+149 149 455 
+356 68 477 468 17 479 66 
+159 172 6 71 6 6 158 13 494 169 
+155 44 438 144 500 
+156 9 9 
+146 146 
+173 10 10 461 
+7 6 6 
+269 48 268 
+50 100 
+323 174 18 
+69 69 22 98 
+38 171 
+22 29 489 10 
+0 0 
+11 5 
+29 13 14 232 231 451 289 452 229 
+260 11 156 
+166 160 166 39 
+223 134 134 420 
+66 401 68 132 17 84 287 5 
+39 304 
+65 84 132 
+400 211 
+145 144 
+16 28 254 48 50 100 42 154 262 133 17 
+0 0 
+28 28 
+11 476 464 
+61 61 86 86 
+38 38 
+463 478 
+437 265 
+22 39 485 171 98 
+434 51 344 
+16 16 
+67 67 67 448 
+22 12 161 
+15 377 147 147 374 
+119 317 0 
+38 484 
+403 499 
+432 442 
+28 0 16 50 465 42 
+163 487 7 162 
+99 99 325 423 83 83 
+154 133 
+5 37 492 235 160 279 
+10 10 457 493 10 460 
+441 4 4 4 4 4 4 4 
+153 153 
+159 164 164 
+328 37 
+65 65 404 347 431 459 
+80 80 44 44 
+61 446 
+162 495 7 453 
+157 21 204 68 37 66 469 145 
+37 151 230 206 240 205 264 87 409 87 288 270 280 329 157 296 454 474 
+430 445 433 
+449 14 
+9 9 9 9 
+440 238 226 
+148 148 
+266 267 181 
+48 498 
+263 255 256 
+458 158 7 
+72 168 12 165 71 73 173 49 
+0 0 
+7 7 6 
+14 29 13 6 15 14 15 13 
+480 439 21 
+450 21 151 
+12 12 49 14 13 165 12 169 72 15 15 
+91 91 
+22 12 49 168 
+497 101 30 411 30 482 30 53 30 101 176 415 53 447 
+462 150 150 
+471 456 131 435 131 467 436 412 227 218 190 466 429 213 326 
--- a/fluid/PaddleRec/ssr/train.py
+++ b/fluid/PaddleRec/ssr/train.py
@@ -13,87 +13,110 @@
 # limitations under the License.
 import os
 import sys
+import time
 import argparse
 import logging
 import paddle.fluid as fluid
 import paddle
-import reader as reader
+import utils
+import numpy as np
 from nets import SequenceSemanticRetrieval

 logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
 logger = logging.getLogger("fluid")
 logger.setLevel(logging.INFO)

+
 def parse_args():
    parser = argparse.ArgumentParser("sequence semantic retrieval")
-    parser.add_argument("--train_file", type=str, help="Training file")
-    parser.add_argument("--valid_file", type=str, help="Validation file")
    parser.add_argument(
-        "--epochs", type=int, default=10, help="Number of epochs for training")
+        "--train_dir", type=str, default='train_data', help="Training file")
+    parser.add_argument(
+        "--base_lr", type=float, default=0.01, help="learning rate")
    parser.add_argument(
-        "--model_output_dir",
+        '--vocab_path',
        type=str,
-        default='model_output',
-        help="Model output folder")
+        default='vocab.txt',
+        help='vocab file address')
+    parser.add_argument(
+        "--epochs", type=int, default=10, help="Number of epochs")
+    parser.add_argument(
+        '--parallel', type=int, default=0, help='whether parallel')
    parser.add_argument(
-        "--sequence_encode_dim",
-        type=int,
-        default=128,
-        help="Dimension of sequence encoder output")
+        '--use_cuda', type=int, default=0, help='whether use gpu')
    parser.add_argument(
-        "--matching_dim",
-        type=int,
-        default=128,
-        help="Dimension of hidden layer")
+        '--print_batch', type=int, default=10, help='num of print batch')
    parser.add_argument(
-        "--batch_size", type=int, default=128, help="Batch size for training")
+        '--model_dir', type=str, default='model_output', help='model dir')
    parser.add_argument(
-        "--embedding_dim",
-        type=int,
-        default=128,
-        help="Default Dimension of Embedding")
+        "--hidden_size", type=int, default=128, help="hidden size")
+    parser.add_argument("--batch_size", type=int, default=50, help="batch size")
+    parser.add_argument(
+        "--embedding_dim", type=int, default=128, help="embedding dim")
+    parser.add_argument(
+        '--num_devices', type=int, default=1, help='Number of GPU devices')
    return parser.parse_args()

-def start_train(args):
-    y_vocab = reader.YoochooseVocab()
-    y_vocab.load([args.train_file])

-    logger.info("Load yoochoose vocabulary size: {}".format(len(y_vocab.get_vocab())))
-    y_data = reader.YoochooseDataset(y_vocab)
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            y_data.train([args.train_file]), buf_size=args.batch_size * 100),
-        batch_size=args.batch_size)
-    place = fluid.CPUPlace()
-    ssr = SequenceSemanticRetrieval(
-        len(y_vocab.get_vocab()), args.embedding_dim, args.matching_dim
-    )
-    input_data, user_rep, item_rep, avg_cost, acc = ssr.train()
-    optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
+def get_cards(args):
+    return args.num_devices
+
+
+def train(args):
+    use_cuda = True if args.use_cuda else False
+    parallel = True if args.parallel else False
+    print("use_cuda:", use_cuda, "parallel:", parallel)
+    train_reader, vocab_size = utils.construct_train_data(
+        args.train_dir, args.vocab_path, args.batch_size * get_cards(args))
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    ssr = SequenceSemanticRetrieval(vocab_size, args.embedding_dim,
+                                    args.hidden_size)
+    # Train program
+    train_input_data, cos_pos, avg_cost, acc = ssr.train()
+
+    # Optimization to minimize lost
+    optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr)
    optimizer.minimize(avg_cost)
-    startup_program = fluid.default_startup_program()
-    loop_program = fluid.default_main_program()
-    data_list = [var.name for var in input_data]
+
+    data_list = [var.name for var in train_input_data]
    feeder = fluid.DataFeeder(feed_list=data_list, place=place)
    exe = fluid.Executor(place)
-    exe.run(startup_program)
+    exe.run(fluid.default_startup_program())
+    if parallel:
+        train_exe = fluid.ParallelExecutor(
+            use_cuda=use_cuda, loss_name=avg_cost.name)
+    else:
+        train_exe = exe

+    total_time = 0.0
    for pass_id in range(args.epochs):
+        epoch_idx = pass_id + 1
+        print("epoch_%d start" % epoch_idx)
+        t0 = time.time()
+        i = 0
        for batch_id, data in enumerate(train_reader()):
-            loss_val, correct_val = exe.run(loop_program,
-                                            feed=feeder.feed(data),
-                                            fetch_list=[avg_cost, acc])
-            logger.info("Train --> pass: {} batch_id: {} avg_cost: {}, acc: {}".
-                        format(pass_id, batch_id, loss_val, 
-                               float(correct_val) / args.batch_size))
-        fluid.io.save_inference_model(args.model_output_dir, 
-                                      [var.name for val in input_data],
-                                      [user_rep, item_rep, avg_cost, acc], exe)
+            i += 1
+            loss_val, correct_val = train_exe.run(
+                feed=feeder.feed(data), fetch_list=[avg_cost.name, acc.name])
+            if i % args.print_batch == 0:
+                logger.info(
+                    "Train --> pass: {} batch_id: {} avg_cost: {}, acc: {}".
+                    format(pass_id, batch_id,
+                           np.mean(loss_val),
+                           float(np.mean(correct_val)) / args.batch_size))
+        t1 = time.time()
+        total_time += t1 - t0
+        print("epoch:%d num_steps:%d time_cost(s):%f" %
+              (epoch_idx, i, total_time / epoch_idx))
+        save_dir = "%s/epoch_%d" % (args.model_dir, epoch_idx)
+        fluid.io.save_params(executor=exe, dirname=save_dir)
+        print("model saved in %s" % save_dir)
+

 def main():
    args = parse_args()
-    start_train(args)
+    train(args)
+

 if __name__ == "__main__":
    main()
-
--- a/fluid/PaddleRec/ssr/train_data/small_train.txt
+++ b/fluid/PaddleRec/ssr/train_data/small_train.txt
+197 196 198 236 
+93 93 384 362 363 43 
+336 364 407 
+421 322 
+314 388 
+128 58 
+138 138 
+46 46 46 
+34 34 57 57 57 342 228 321 346 357 59 376 
+110 110 
+135 94 135 
+27 250 27 
+129 118 
+18 18 18 
+81 81 89 89 
+27 27 
+20 20 20 20 20 212 
+33 33 33 33 
+62 62 62 63 63 55 248 124 381 428 383 382 43 43 261 63 
+90 90 78 78 
+399 397 202 141 104 104 245 192 191 271 
+239 332 283 88 
+187 313 
+136 136 324 
+41 41 
+352 128 
+413 414 
+410 45 45 45 1 1 1 1 1 1 1 1 31 31 31 31 
+92 334 92 
+95 285 
+215 249 
+390 41 
+116 116 
+300 252 
+2 2 2 2 2 
+8 8 8 8 8 8 
+53 241 259 
+118 129 126 94 137 208 216 299 
+209 368 139 418 419 
+311 180 
+303 302 203 284 
+369 32 32 32 32 337 
+207 47 47 47 
+106 107 
+143 143 
+179 178 
+109 109 
+405 79 79 371 246 
+251 417 427 
+333 88 387 358 123 348 394 360 36 365 
+3 3 3 3 3 
+189 188 
+398 425 
+107 406 
+281 201 141 
+2 2 2 
+359 54 
+395 385 293 
+60 60 60 121 121 233 58 58 
+24 199 175 24 24 24 351 386 106 
+115 294 
+122 122 127 127 
+35 35 
+282 393 
+277 140 140 343 225 123 36 36 36 221 114 114 59 59 117 117 247 367 219 258 222 301 375 350 353 111 111 
+275 272 273 274 331 330 305 108 76 76 108 
+26 26 26 408 26 
+290 18 210 291 
+372 139 424 113 
+341 340 335 
+120 370 
+224 200 
+426 416 
+137 319 
+402 55 
+54 54 
+327 119 
+125 125 
+391 396 354 355 389 
+142 142 
+295 320 
+113 366 
+253 85 85 
+56 56 310 309 308 307 278 25 25 19 19 3 312 19 19 19 3 25 
+220 338 
+34 130 
+130 120 380 315 
+339 422 
+379 378 
+95 56 392 115 
+55 124 
+126 34 
+349 373 361 
+195 194 
+75 75 
+64 64 64 
+35 35 
+40 40 40 242 77 244 77 243 
+257 316 
+103 306 102 51 52 103 105 52 52 292 318 112 286 345 237 276 112 51 102 105 
--- a/fluid/PaddleRec/ssr/utils.py
+++ b/fluid/PaddleRec/ssr/utils.py
+import numpy as np
+import reader as reader
+import os
+import logging
+import paddle.fluid as fluid
+import paddle
+
+
+def get_vocab_size(vocab_path):
+    with open(vocab_path, "r") as rf:
+        line = rf.readline()
+        return int(line.strip())
+
+
+def construct_train_data(file_dir, vocab_path, batch_size):
+    vocab_size = get_vocab_size(vocab_path)
+    files = [file_dir + '/' + f for f in os.listdir(file_dir)]
+    y_data = reader.YoochooseDataset(vocab_size)
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            y_data.train(files), buf_size=batch_size * 100),
+        batch_size=batch_size)
+    return train_reader, vocab_size
+
+
+def construct_test_data(file_dir, vocab_path, batch_size):
+    vocab_size = get_vocab_size(vocab_path)
+    files = [file_dir + '/' + f for f in os.listdir(file_dir)]
+    y_data = reader.YoochooseDataset(vocab_size)
+    test_reader = paddle.batch(y_data.test(files), batch_size=batch_size)
+    return test_reader, vocab_size
+
+
+def infer_data(raw_data, place):
+    data = [dat[0] for dat in raw_data]
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    p_label = [dat[1] for dat in raw_data]
+    pos_label = np.array(p_label).astype("int64").reshape(len(p_label), 1)
+    return res, pos_label
--- a/fluid/PaddleRec/ssr/vocab.txt
+++ b/fluid/PaddleRec/ssr/vocab.txt
+501