modify gru4rec format2

6d189911 · frankwhzhang · 14139f8f · 6d189911 · 6d189911 · 6d189911
14 changed files
--- a/fluid/Recommender/gru4rec/data_preprocess.py
+++ b/fluid/Recommender/gru4rec/data_preprocess.py
+"""
+Data preprocessing helpers for the gru4rec example.
+This module builds a word dictionary from local train/test text files and
+wraps those files in readers that yield (source sequence, target sequence)
+pairs of word IDs.
+"""
+
+from __future__ import print_function
+
+import paddle.dataset.common
+import collections
+import tarfile
+import six
+
+# Only names defined in this module may be listed here: an undefined entry
+# makes ``from data_preprocess import *`` raise AttributeError.
+__all__ = ['train', 'test', 'build_dict']
+
+
+
+class DataType(object):
+    # Tag for sequence samples; the only value reader_creator accepts.
+    SEQ = 2
+
+
+def word_count(f, word_freq=None):
+    """
+    Accumulate whitespace-separated token frequencies from an iterable of lines.
+
+    :param f: iterable of text lines (e.g. an open file).
+    :param word_freq: optional mapping updated in place; a fresh
+        ``collections.defaultdict(int)`` is created when omitted.
+    :return: the mapping holding the updated counts.
+    """
+    counts = collections.defaultdict(int) if word_freq is None else word_freq
+    for line in f:
+        for token in line.strip().split():
+            counts[token] += 1
+    return counts
+
+
+def build_dict(min_word_freq=50, train_filename="", test_filename=""):
+    """
+    Build a word dictionary from the train and test corpora.
+
+    Keys of the dictionary are words and values are zero-based IDs. Words are
+    ranked by descending frequency, ties broken alphabetically, so the most
+    frequent word gets ID 0.
+
+    :param min_word_freq: a word is kept only if it occurs strictly more than
+        this many times in the two files combined.
+    :param train_filename: path of the training text file.
+    :param test_filename: path of the test text file.
+    :return: dict mapping each kept word to its ID; empty when no word passes
+        the frequency threshold.
+    """
+    with open(train_filename) as trainf:
+        with open(test_filename) as testf:
+            word_freq = word_count(testf, word_count(trainf))
+
+    # '<unk>' is dropped from the counts and gets no ID in this function.
+    word_freq.pop('<unk>', None)
+
+    kept = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq]
+    kept.sort(key=lambda x: (-x[1], x[0]))
+    # Enumerating (instead of unpacking zip(*kept)) also handles an empty list.
+    return dict((word, idx) for idx, (word, _) in enumerate(kept))
+
+
+def reader_creator(filename, word_idx, n, data_type):
+    """
+    Create a reader over a text file of whitespace-separated sequences.
+
+    Each line is mapped to word IDs and split into a pair: the source is
+    every ID but the last, the target is every ID but the first.
+
+    :param filename: path of the text file, one sequence per line.
+    :param word_idx: dict mapping words to IDs.
+    :param n: maximum source length; longer lines are skipped. Values <= 0
+        disable the limit.
+    :param data_type: must be ``DataType.SEQ``.
+    :return: a no-argument callable returning a generator of
+        ``(src_seq, trg_seq)`` list pairs.
+    :raises AssertionError: from the generator, when a line is read and
+        ``data_type`` is not ``DataType.SEQ``.
+    """
+    def reader():
+        with open(filename) as f:
+            for line in f:
+                if data_type != DataType.SEQ:
+                    # Raised explicitly so the check survives ``python -O``.
+                    raise AssertionError('error data type')
+                # NOTE(review): words missing from word_idx become None here;
+                # confirm the vocabulary always covers the file.
+                ids = [word_idx.get(w) for w in line.strip().split()]
+                src_seq = ids[:-1]
+                trg_seq = ids[1:]
+                if n > 0 and len(src_seq) > n:
+                    continue
+                yield src_seq, trg_seq
+
+    return reader
+
+
+def train(filename, word_idx, n, data_type=DataType.SEQ):
+    """Return a reader over the training file (thin wrapper around reader_creator)."""
+    return reader_creator(
+        filename=filename, word_idx=word_idx, n=n, data_type=data_type)
+def test(filename, word_idx, n, data_type=DataType.SEQ):
+    """Return a reader over the test file (thin wrapper around reader_creator)."""
+    return reader_creator(
+        filename=filename, word_idx=word_idx, n=n, data_type=data_type)
--- a/fluid/Recommender/gru4rec/infer.py
+++ b/fluid/Recommender/gru4rec/infer.py
+import sys
+import time
+import math
+import unittest
+import contextlib
+import numpy as np
+import six
+import paddle.fluid as fluid
+import paddle
+
+import utils
+
+def infer(test_reader, use_cuda, model_path):
+    """
+    Evaluate a saved inference model on the test set and print recall@20.
+
+    :param test_reader: callable returning an iterable of batches; each batch
+        is a list of ``(src_seq, dst_seq)`` pairs.
+    :param use_cuda: run on ``fluid.CUDAPlace(0)`` if true, else on CPU.
+    :param model_path: directory passed to ``fluid.io.load_inference_model``.
+    """
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    with fluid.scope_guard(fluid.core.Scope()):
+        infer_program, feed_target_names, fetch_vars = \
+            fluid.io.load_inference_model(model_path, exe)
+        accum_num_recall = 0.0
+        accum_num_sum = 0.0
+        t0 = time.time()
+        for step_id, data in enumerate(test_reader(), 1):
+            label_data = [dat[1] for dat in data]
+            src_wordseq = utils.to_lodtensor([dat[0] for dat in data], place)
+            dst_wordseq = utils.to_lodtensor(label_data, place)
+            para = exe.run(
+                infer_program,
+                feed={"src_wordseq": src_wordseq,
+                      "dst_wordseq": dst_wordseq},
+                fetch_list=fetch_vars,
+                return_numpy=False)
+
+            # para[1] is the second fetch target; its first element is used
+            # as the batch score and weighted by the number of labels.
+            acc_ = para[1]._get_float_element(0)
+            data_length = sum(len(seq) for seq in label_data)
+            accum_num_sum += data_length
+            accum_num_recall += data_length * acc_
+            if step_id % 100 == 0:
+                # One pre-formatted string prints the same on Python 2 and 3.
+                running = accum_num_recall / accum_num_sum if accum_num_sum else 0.0
+                print("step:%d   %s" % (step_id, running))
+        t1 = time.time()
+        # Guard against an empty reader, which used to raise ZeroDivisionError.
+        recall = accum_num_recall / accum_num_sum if accum_num_sum else 0.0
+        print("model:%s recall@20:%.3f time_cost(s):%.2f" %
+              (model_path, recall, t1 - t0))
+
+
+if __name__ == "__main__":
+    # Command line: model_dir start_epoch last_epoch; evaluates every saved
+    # epoch model in the inclusive range.
+    usage = ("Usage: %s model_dir start_epoch last_epoch(inclusive)" %
+             sys.argv[0])
+    if len(sys.argv) != 4:
+        print(usage)
+        sys.exit(0)
+
+    model_dir = sys.argv[1]
+    try:
+        start_index = int(sys.argv[2])
+        last_index = int(sys.argv[3])
+    except ValueError:
+        # Non-integer epoch bounds: show the usage line and fail.
+        print(usage)
+        sys.exit(-1)
+    train_file = "small_train.txt"
+    test_file = "small_test.txt"
+    vocab, train_reader, test_reader = utils.prepare_data(
+        train_file, test_file,
+        batch_size=5, buffer_size=1000, word_freq_threshold=0)
+
+    # six.moves.xrange keeps the loop working on both Python 2 and 3.
+    for epoch in six.moves.xrange(start_index, last_index + 1):
+        epoch_path = model_dir + "/epoch_" + str(epoch)
+        infer(test_reader=test_reader, use_cuda=True, model_path=epoch_path)
--- a/fluid/Recommender/gru4rec/small_test.txt
+++ b/fluid/Recommender/gru4rec/small_test.txt
--- a/fluid/Recommender/gru4rec/small_train.txt
+++ b/fluid/Recommender/gru4rec/small_train.txt
--- a/fluid/Recommender/gru4rec/sort_batch.py
+++ b/fluid/Recommender/gru4rec/sort_batch.py
+def batch(reader, batch_size, sort_group_size, drop_last=False):
+    """
+    Create a batched reader whose batches are sorted by sequence length.
+
+    Instances are collected into groups of ``sort_group_size``, each group is
+    sorted by ``len(instance[0])`` in descending order and then cut into
+    mini-batches. Instances left over after cutting a group are carried into
+    the next group instead of being discarded.
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param batch_size: size of each mini-batch
+    :type batch_size: int
+    :param sort_group_size: size of partial sorted batch
+    :type sort_group_size: int
+    :param drop_last: drop the last batch, if the size of last batch is not equal to batch_size.
+    :type drop_last: bool
+    :return: the batched reader.
+    :rtype: callable
+    :raises ValueError: if ``batch_size`` is not positive.
+    """
+    # Batch size check
+    batch_size = int(batch_size)
+    if batch_size <= 0:
+        raise ValueError("batch_size should be a positive integeral value, "
+                         "but got batch_size={}".format(batch_size))
+
+    def by_length(instance):
+        return len(instance[0])
+
+    def batch_reader():
+        pending = []
+        for instance in reader():
+            pending.append(instance)
+            if len(pending) >= sort_group_size:
+                ordered = sorted(pending, key=by_length, reverse=True)
+                full = len(ordered) - len(ordered) % batch_size
+                for start in range(0, full, batch_size):
+                    yield ordered[start:start + batch_size]
+                # Keep what did not fill a batch for the next group.
+                pending = ordered[full:]
+
+        # Flush the tail: full batches are always emitted, the final partial
+        # batch only when drop_last is false.
+        ordered = sorted(pending, key=by_length, reverse=True)
+        for start in range(0, len(ordered), batch_size):
+            chunk = ordered[start:start + batch_size]
+            if len(chunk) == batch_size or not drop_last:
+                yield chunk
+
+    return batch_reader
--- a/fluid/Recommender/gru4rec/train.py
+++ b/fluid/Recommender/gru4rec/train.py
+import os
+import sys
+import time
+import six
+import numpy as np
+import math
+import argparse
+import paddle.fluid as fluid
+import paddle
+import time
+import utils
+ 
+SEED = 102
+
+def parse_args():
+    """Parse the command-line flags of the gru4rec benchmark script."""
+    arg_parser = argparse.ArgumentParser("gru4rec benchmark.")
+    # Boolean switch: True when the flag is present, False otherwise.
+    arg_parser.add_argument(
+        '--enable_ce',
+        action='store_true',
+        help='If set, run \
+        the task with continuous evaluation logs.')
+    arg_parser.add_argument(
+        '--num_devices', type=int, default=1, help='Number of GPU devices')
+    return arg_parser.parse_args()
+
+def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound):
+    """
+    Build the model graph: embedding -> fc -> dynamic GRU -> softmax fc.
+
+    Returns the cross-entropy cost and the ``k=20`` accuracy of the softmax
+    output, both computed against ``dst``.
+    """
+    def uniform_attr(lr_value):
+        # All layers share the same uniform init range and differ only in
+        # the learning_rate given to their ParamAttr.
+        return fluid.ParamAttr(
+            initializer=fluid.initializer.Uniform(
+                low=init_low_bound, high=init_high_bound),
+            learning_rate=lr_value)
+
+    embedding_lr, gru_lr, softmax_lr = 10.0, 1.0, 1.0
+
+    embedded = fluid.layers.embedding(
+        input=src,
+        size=[vocab_size, hid_size],
+        param_attr=uniform_attr(embedding_lr),
+        is_sparse=True)
+
+    # The projection fed to dynamic_gru is three times the hidden size.
+    gru_input = fluid.layers.fc(
+        input=embedded, size=hid_size * 3, param_attr=uniform_attr(gru_lr))
+    hidden = fluid.layers.dynamic_gru(
+        input=gru_input, size=hid_size, param_attr=uniform_attr(gru_lr))
+
+    prediction = fluid.layers.fc(
+        input=hidden,
+        size=vocab_size,
+        act='softmax',
+        param_attr=uniform_attr(softmax_lr))
+
+    cost = fluid.layers.cross_entropy(input=prediction, label=dst)
+    acc = fluid.layers.accuracy(input=prediction, label=dst, k=20)
+    return cost, acc
+
+def train(train_reader,
+          vocab,
+          network,
+          hid_size,
+          base_lr,
+          batch_size,
+          pass_num,
+          use_cuda,
+          parallel,
+          model_dir,
+          init_low_bound=-0.04,
+          init_high_bound=0.04):
+    """
+    Build the training program, run ``pass_num`` epochs and save an
+    inference model after each epoch.
+
+    :param train_reader: callable returning an iterable of batches; each
+        batch is a list of ``(src_seq, dst_seq)`` pairs.
+    :param vocab: vocabulary; only its length is used.
+    :param network: callable building the model, returning ``(cost, acc)``.
+    :param hid_size: hidden size forwarded to ``network``.
+    :param base_lr: learning rate of the Adagrad optimizer.
+    :param batch_size: not used inside this function.
+    :param pass_num: number of epochs.
+    :param use_cuda: run on ``fluid.CUDAPlace(0)`` if true, else on CPU.
+    :param parallel: train with ``fluid.ParallelExecutor`` if true.
+    :param model_dir: models are saved to ``<model_dir>/epoch_<k>``.
+    :param init_low_bound: lower bound forwarded to ``network``.
+    :param init_high_bound: upper bound forwarded to ``network``.
+    """
+    args = parse_args()
+    if args.enable_ce:
+        # random seed must set before configuring the network.
+        fluid.default_startup_program().random_seed = SEED
+
+    vocab_size = len(vocab)
+
+    # Input data
+    src_wordseq = fluid.layers.data(
+        name="src_wordseq", shape=[1], dtype="int64", lod_level=1)
+    dst_wordseq = fluid.layers.data(
+        name="dst_wordseq", shape=[1], dtype="int64", lod_level=1)
+
+    # Train program
+    cost, acc = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
+                        init_low_bound, init_high_bound)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # Optimization to minimize the loss
+    optimizer = fluid.optimizer.Adagrad(learning_rate=base_lr)
+    optimizer.minimize(avg_cost)
+
+    # Initialize executor
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+    if parallel:
+        train_exe = fluid.ParallelExecutor(
+            use_cuda=use_cuda, loss_name=avg_cost.name)
+    else:
+        train_exe = exe
+    total_time = 0.0
+    fetch_list = [avg_cost.name]
+    for pass_idx in six.moves.xrange(pass_num):
+        epoch_idx = pass_idx + 1
+        print("epoch_%d start" % epoch_idx)
+
+        t0 = time.time()
+        i = 0
+        newest_ppl = 0
+        for data in train_reader():
+            i += 1
+            lod_src_wordseq = utils.to_lodtensor(
+                [dat[0] for dat in data], place)
+            lod_dst_wordseq = utils.to_lodtensor(
+                [dat[1] for dat in data], place)
+            ret_avg_cost = train_exe.run(
+                feed={
+                    "src_wordseq": lod_src_wordseq,
+                    "dst_wordseq": lod_dst_wordseq
+                },
+                fetch_list=fetch_list)
+            # Perplexity is the exponential of the fetched average cost.
+            avg_ppl = np.exp(ret_avg_cost[0])
+            newest_ppl = np.mean(avg_ppl)
+            if i % 10 == 0:
+                print("step:%d ppl:%.3f" % (i, newest_ppl))
+
+        t1 = time.time()
+        total_time += t1 - t0
+        print("epoch:%d num_steps:%d time_cost(s):%f" %
+              (epoch_idx, i, total_time / epoch_idx))
+
+        if pass_idx == pass_num - 1 and args.enable_ce:
+            #Note: The following logs are special for CE monitoring.
+            #Other situations do not need to care about these logs.
+            # get_cards expects the parsed args object, not the bare flag.
+            gpu_num = get_cards(args)
+            if gpu_num == 1:
+                print("kpis\trsc15_pass_duration\t%s" %
+                      (total_time / epoch_idx))
+                print("kpis\trsc15_avg_ppl\t%s" % newest_ppl)
+            else:
+                print("kpis\trsc15_pass_duration_card%s\t%s" %
+                      (gpu_num, total_time / epoch_idx))
+                print("kpis\trsc15_avg_ppl_card%s\t%s" %
+                      (gpu_num, newest_ppl))
+        save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
+        feed_var_names = ["src_wordseq", "dst_wordseq"]
+        fetch_vars = [avg_cost, acc]
+        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
+        print("model saved in %s" % save_dir)
+
+    print("finish training")
+
+def get_cards(args):
+    """
+    Return the number of GPU cards the run should assume.
+
+    :param args: parsed arguments exposing ``enable_ce`` and ``num_devices``.
+    :return: with ``enable_ce`` set, the number of comma-separated entries in
+        the ``CUDA_VISIBLE_DEVICES`` environment variable (falling back to
+        ``args.num_devices`` when the variable is unset); otherwise
+        ``args.num_devices``.
+    """
+    if args.enable_ce:
+        cards = os.environ.get('CUDA_VISIBLE_DEVICES')
+        # An unset variable yields None, which has no split(); fall through
+        # to num_devices in that case.
+        if cards is not None:
+            return len(cards.split(","))
+    return args.num_devices
+
+def train_net():
+    """Prepare the small sample data set and run the training loop on it."""
+    per_card_batch = 50
+    cli_args = parse_args()
+    # The reader batch is scaled by the card count; the test reader returned
+    # by prepare_data is not needed for training.
+    vocab, train_reader, _ = utils.prepare_data(
+        "small_train.txt",
+        "small_test.txt",
+        batch_size=per_card_batch * get_cards(cli_args),
+        buffer_size=1000,
+        word_freq_threshold=0)
+    train(
+        train_reader=train_reader,
+        vocab=vocab,
+        network=network,
+        hid_size=100,
+        base_lr=0.01,
+        batch_size=per_card_batch,
+        pass_num=10,
+        use_cuda=True,
+        parallel=False,
+        model_dir="model_recall20",
+        init_low_bound=-0.1,
+        init_high_bound=0.1)
+
+
+if __name__ == "__main__":
+    train_net()
--- a/fluid/Recommender/gru4rec/utils.py
+++ b/fluid/Recommender/gru4rec/utils.py
+import sys
+import time
+import numpy as np
+import paddle.fluid as fluid
+import paddle
+import data_preprocess as dp
+import sort_batch as sortb
+
+def to_lodtensor(data, place):
+    """
+    Pack a batch of variable-length sequences into a ``fluid.LoDTensor``.
+
+    :param data: list of sequences of IDs.
+    :param place: place object passed to ``LoDTensor.set``.
+    :return: LoDTensor holding the concatenated IDs as an ``int64`` array of
+        shape ``[total_len, 1]`` with a single level of LoD offsets.
+    """
+    # Cumulative offsets: [0, len(seq0), len(seq0) + len(seq1), ...]
+    offsets = [0]
+    for seq in data:
+        offsets.append(offsets[-1] + len(seq))
+
+    flat = np.concatenate(data, axis=0).astype("int64")
+    column = flat.reshape([len(flat), 1])
+
+    tensor = fluid.LoDTensor()
+    tensor.set(column, place)
+    tensor.set_lod([offsets])
+    return tensor
+
+def prepare_data(train_filename, test_filename, batch_size,
+                 buffer_size=1000, word_freq_threshold=0, enable_ce=False):
+    """
+    Build the vocabulary and the batched train/test readers.
+
+    :param train_filename: path of the training text file.
+    :param test_filename: path of the test text file.
+    :param batch_size: number of samples per mini-batch.
+    :param buffer_size: shuffle buffer size. NOTE(review): the same value is
+        also passed to the readers as the maximum sequence length ``n`` --
+        confirm this reuse is intended.
+    :param word_freq_threshold: forwarded to ``dp.build_dict`` as the minimum
+        word frequency.
+    :param enable_ce: when true, training data is batched with
+        ``paddle.batch`` without shuffling or length sorting.
+    :return: ``(vocab, train_reader, test_reader)``.
+    """
+    print("start constuct word dict")
+    vocab = dp.build_dict(word_freq_threshold, train_filename, test_filename)
+    print("construct word dict done\n")
+
+    train_samples = dp.train(
+        train_filename, vocab, buffer_size, data_type=dp.DataType.SEQ)
+    if enable_ce:
+        train_reader = paddle.batch(train_samples, batch_size)
+    else:
+        # Shuffle, then sort groups of 20 batches by length before batching.
+        train_reader = sortb.batch(
+            paddle.reader.shuffle(train_samples, buf_size=buffer_size),
+            batch_size, batch_size * 20)
+    test_reader = sortb.batch(
+        dp.test(test_filename, vocab, buffer_size, data_type=dp.DataType.SEQ),
+        batch_size, batch_size * 20)
+    return vocab, train_reader, test_reader
--- a/fluid/recommender/gru4rec/data_preprocess.py
+++ b/fluid/recommender/gru4rec/data_preprocess.py
+import collections
+import six
+
+class DataType(object):
+    # Tag for sequence samples; the only value reader_creator accepts.
+    SEQ = 2
+
+def word_count(input_file, word_freq=None):
+    """
+    Accumulate whitespace-separated token frequencies from a corpus.
+
+    ``input_file`` is any iterable of text lines. ``word_freq`` is an
+    optional mapping updated in place; a fresh
+    ``collections.defaultdict(int)`` is created when it is omitted. The
+    mapping holding the updated counts is returned.
+    """
+    counts = collections.defaultdict(int) if word_freq is None else word_freq
+    for line in input_file:
+        for token in line.strip().split():
+            counts[token] += 1
+    return counts
+
+def build_dict(min_word_freq=50, train_filename="", test_filename=""):
+    """
+    Build a word dictionary from the train and test corpora.
+
+    Keys of the dictionary are words and values are zero-based IDs. Words are
+    ranked by descending frequency, ties broken alphabetically, so the most
+    frequent word gets ID 0.
+
+    :param min_word_freq: a word is kept only if it occurs strictly more than
+        this many times in the two files combined.
+    :param train_filename: path of the training text file.
+    :param test_filename: path of the test text file.
+    :return: dict mapping each kept word to its ID; empty when no word passes
+        the frequency threshold.
+    """
+    with open(train_filename) as trainf:
+        with open(test_filename) as testf:
+            word_freq = word_count(testf, word_count(trainf))
+
+    kept = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq]
+    kept.sort(key=lambda x: (-x[1], x[0]))
+    # Enumerating (instead of unpacking zip(*kept)) also handles an empty list.
+    return dict((word, idx) for idx, (word, _) in enumerate(kept))
+
+def reader_creator(filename, word_idx, n, data_type):
+    """
+    Create a reader over a text file of whitespace-separated sequences.
+
+    Each line is mapped to word IDs and split into a pair: the source is
+    every ID but the last, the target is every ID but the first.
+
+    :param filename: path of the text file, one sequence per line.
+    :param word_idx: dict mapping words to IDs.
+    :param n: maximum source length; longer lines are skipped. Values <= 0
+        disable the limit.
+    :param data_type: must be ``DataType.SEQ``.
+    :return: a no-argument callable returning a generator of
+        ``(src_seq, trg_seq)`` list pairs.
+    :raises AssertionError: from the generator, when a line is read and
+        ``data_type`` is not ``DataType.SEQ``.
+    """
+    def reader():
+        with open(filename) as f:
+            for line in f:
+                if data_type != DataType.SEQ:
+                    # Raised explicitly so the check survives ``python -O``.
+                    raise AssertionError('error data type')
+                # NOTE(review): words missing from word_idx become None here;
+                # confirm the vocabulary always covers the file.
+                ids = [word_idx.get(w) for w in line.strip().split()]
+                src_seq = ids[:-1]
+                trg_seq = ids[1:]
+                if n > 0 and len(src_seq) > n:
+                    continue
+                yield src_seq, trg_seq
+    return reader
+
+def train(filename, word_idx, n, data_type=DataType.SEQ):
+    """Return a reader over the training file (thin wrapper around reader_creator)."""
+    return reader_creator(
+        filename=filename, word_idx=word_idx, n=n, data_type=data_type)
+
+def test(filename, word_idx, n, data_type=DataType.SEQ):
+    """Return a reader over the test file (thin wrapper around reader_creator)."""
+    return reader_creator(
+        filename=filename, word_idx=word_idx, n=n, data_type=data_type)
--- a/fluid/recommender/gru4rec/infer.py
+++ b/fluid/recommender/gru4rec/infer.py
+import sys
+import time
+import math
+import unittest
+import contextlib
+import numpy as np
+import six
+import paddle.fluid as fluid
+import paddle
+
+import utils
+
+def infer(test_reader, use_cuda, model_path):
+    """
+    Evaluate a saved inference model on the test set and print recall@20.
+
+    :param test_reader: callable returning an iterable of batches; each batch
+        is a list of ``(src_seq, dst_seq)`` pairs.
+    :param use_cuda: run on ``fluid.CUDAPlace(0)`` if true, else on CPU.
+    :param model_path: directory passed to ``fluid.io.load_inference_model``.
+    """
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    with fluid.scope_guard(fluid.core.Scope()):
+        infer_program, feed_target_names, fetch_vars = \
+            fluid.io.load_inference_model(model_path, exe)
+        accum_num_recall = 0.0
+        accum_num_sum = 0.0
+        t0 = time.time()
+        for step_id, data in enumerate(test_reader(), 1):
+            label_data = [dat[1] for dat in data]
+            src_wordseq = utils.to_lodtensor([dat[0] for dat in data], place)
+            dst_wordseq = utils.to_lodtensor(label_data, place)
+            para = exe.run(
+                infer_program,
+                feed={"src_wordseq": src_wordseq,
+                      "dst_wordseq": dst_wordseq},
+                fetch_list=fetch_vars,
+                return_numpy=False)
+
+            # para[1] is the second fetch target; its first element is used
+            # as the batch score and weighted by the number of labels.
+            acc_ = para[1]._get_float_element(0)
+            data_length = sum(len(seq) for seq in label_data)
+            accum_num_sum += data_length
+            accum_num_recall += data_length * acc_
+            if step_id % 100 == 0:
+                # One pre-formatted string prints the same on Python 2 and 3.
+                running = accum_num_recall / accum_num_sum if accum_num_sum else 0.0
+                print("step:%d   %s" % (step_id, running))
+        t1 = time.time()
+        # Guard against an empty reader, which used to raise ZeroDivisionError.
+        recall = accum_num_recall / accum_num_sum if accum_num_sum else 0.0
+        print("model:%s recall@20:%.3f time_cost(s):%.2f" %
+              (model_path, recall, t1 - t0))
+
+
+if __name__ == "__main__":
+    # Command line: model_dir start_epoch last_epoch; evaluates every saved
+    # epoch model in the inclusive range.
+    usage = ("Usage: %s model_dir start_epoch last_epoch(inclusive)" %
+             sys.argv[0])
+    if len(sys.argv) != 4:
+        print(usage)
+        sys.exit(0)
+
+    model_dir = sys.argv[1]
+    try:
+        start_index = int(sys.argv[2])
+        last_index = int(sys.argv[3])
+    except ValueError:
+        # Non-integer epoch bounds: show the usage line and fail.
+        print(usage)
+        sys.exit(-1)
+    train_file = "small_train.txt"
+    test_file = "small_test.txt"
+    vocab, train_reader, test_reader = utils.prepare_data(
+        train_file, test_file,
+        batch_size=5, buffer_size=1000, word_freq_threshold=0)
+
+    # six.moves.xrange keeps the loop working on both Python 2 and 3.
+    for epoch in six.moves.xrange(start_index, last_index + 1):
+        epoch_path = model_dir + "/epoch_" + str(epoch)
+        infer(test_reader=test_reader, use_cuda=True, model_path=epoch_path)
--- a/fluid/recommender/gru4rec/small_test.txt
+++ b/fluid/recommender/gru4rec/small_test.txt
--- a/fluid/recommender/gru4rec/small_train.txt
+++ b/fluid/recommender/gru4rec/small_train.txt
--- a/fluid/recommender/gru4rec/sort_batch.py
+++ b/fluid/recommender/gru4rec/sort_batch.py
+def batch(reader, batch_size, sort_group_size, drop_last=False):
+    """
+    Create a batched reader whose batches are sorted by sequence length.
+
+    Instances are collected into groups of ``sort_group_size``, each group is
+    sorted by ``len(instance[0])`` in descending order and then cut into
+    mini-batches. Instances left over after cutting a group are carried into
+    the next group instead of being discarded.
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param batch_size: size of each mini-batch
+    :type batch_size: int
+    :param sort_group_size: size of partial sorted batch
+    :type sort_group_size: int
+    :param drop_last: drop the last batch, if the size of last batch is not equal to batch_size.
+    :type drop_last: bool
+    :return: the batched reader.
+    :rtype: callable
+    :raises ValueError: if ``batch_size`` is not positive.
+    """
+    # Batch size check
+    batch_size = int(batch_size)
+    if batch_size <= 0:
+        raise ValueError("batch_size should be a positive integeral value, "
+                         "but got batch_size={}".format(batch_size))
+
+    def by_length(instance):
+        return len(instance[0])
+
+    def batch_reader():
+        pending = []
+        for instance in reader():
+            pending.append(instance)
+            if len(pending) >= sort_group_size:
+                ordered = sorted(pending, key=by_length, reverse=True)
+                full = len(ordered) - len(ordered) % batch_size
+                for start in range(0, full, batch_size):
+                    yield ordered[start:start + batch_size]
+                # Keep what did not fill a batch for the next group.
+                pending = ordered[full:]
+
+        # Flush the tail: full batches are always emitted, the final partial
+        # batch only when drop_last is false.
+        ordered = sorted(pending, key=by_length, reverse=True)
+        for start in range(0, len(ordered), batch_size):
+            chunk = ordered[start:start + batch_size]
+            if len(chunk) == batch_size or not drop_last:
+                yield chunk
+
+    return batch_reader
--- a/fluid/recommender/gru4rec/train.py
+++ b/fluid/recommender/gru4rec/train.py
+import os
+import sys
+import time
+import six
+import numpy as np
+import math
+import argparse
+import paddle.fluid as fluid
+import paddle
+import time
+import utils
+ 
+SEED = 102
+
+def parse_args():
+    """Parse the command-line flags of the gru4rec benchmark script."""
+    arg_parser = argparse.ArgumentParser("gru4rec benchmark.")
+    # Boolean switch: True when the flag is present, False otherwise.
+    arg_parser.add_argument(
+        '--enable_ce',
+        action='store_true',
+        help='If set, run \
+        the task with continuous evaluation logs.')
+    arg_parser.add_argument(
+        '--num_devices', type=int, default=1, help='Number of GPU devices')
+    return arg_parser.parse_args()
+
+def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound):
+    """
+    Build the model graph: embedding -> fc -> dynamic GRU -> softmax fc.
+
+    Returns the cross-entropy cost and the ``k=20`` accuracy of the softmax
+    output, both computed against ``dst``.
+    """
+    def uniform_attr(lr_value):
+        # All layers share the same uniform init range and differ only in
+        # the learning_rate given to their ParamAttr.
+        return fluid.ParamAttr(
+            initializer=fluid.initializer.Uniform(
+                low=init_low_bound, high=init_high_bound),
+            learning_rate=lr_value)
+
+    embedding_lr, gru_lr, softmax_lr = 10.0, 1.0, 1.0
+
+    embedded = fluid.layers.embedding(
+        input=src,
+        size=[vocab_size, hid_size],
+        param_attr=uniform_attr(embedding_lr),
+        is_sparse=True)
+
+    # The projection fed to dynamic_gru is three times the hidden size.
+    gru_input = fluid.layers.fc(
+        input=embedded, size=hid_size * 3, param_attr=uniform_attr(gru_lr))
+    hidden = fluid.layers.dynamic_gru(
+        input=gru_input, size=hid_size, param_attr=uniform_attr(gru_lr))
+
+    prediction = fluid.layers.fc(
+        input=hidden,
+        size=vocab_size,
+        act='softmax',
+        param_attr=uniform_attr(softmax_lr))
+
+    cost = fluid.layers.cross_entropy(input=prediction, label=dst)
+    acc = fluid.layers.accuracy(input=prediction, label=dst, k=20)
+    return cost, acc
+
+def train(train_reader,
+          vocab,
+          network,
+          hid_size,
+          base_lr,
+          batch_size,
+          pass_num,
+          use_cuda,
+          parallel,
+          model_dir,
+          init_low_bound=-0.04,
+          init_high_bound=0.04):
+    """
+    Build the training program, run ``pass_num`` epochs and save an
+    inference model after each epoch.
+
+    :param train_reader: callable returning an iterable of batches; each
+        batch is a list of ``(src_seq, dst_seq)`` pairs.
+    :param vocab: vocabulary; only its length is used.
+    :param network: callable building the model, returning ``(cost, acc)``.
+    :param hid_size: hidden size forwarded to ``network``.
+    :param base_lr: learning rate of the Adagrad optimizer.
+    :param batch_size: not used inside this function.
+    :param pass_num: number of epochs.
+    :param use_cuda: run on ``fluid.CUDAPlace(0)`` if true, else on CPU.
+    :param parallel: train with ``fluid.ParallelExecutor`` if true.
+    :param model_dir: models are saved to ``<model_dir>/epoch_<k>``.
+    :param init_low_bound: lower bound forwarded to ``network``.
+    :param init_high_bound: upper bound forwarded to ``network``.
+    """
+    args = parse_args()
+    if args.enable_ce:
+        # random seed must set before configuring the network.
+        fluid.default_startup_program().random_seed = SEED
+
+    vocab_size = len(vocab)
+
+    # Input data
+    src_wordseq = fluid.layers.data(
+        name="src_wordseq", shape=[1], dtype="int64", lod_level=1)
+    dst_wordseq = fluid.layers.data(
+        name="dst_wordseq", shape=[1], dtype="int64", lod_level=1)
+
+    # Train program
+    cost, acc = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
+                        init_low_bound, init_high_bound)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # Optimization to minimize the loss
+    optimizer = fluid.optimizer.Adagrad(learning_rate=base_lr)
+    optimizer.minimize(avg_cost)
+
+    # Initialize executor
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+    if parallel:
+        train_exe = fluid.ParallelExecutor(
+            use_cuda=use_cuda, loss_name=avg_cost.name)
+    else:
+        train_exe = exe
+    total_time = 0.0
+    fetch_list = [avg_cost.name]
+    for pass_idx in six.moves.xrange(pass_num):
+        epoch_idx = pass_idx + 1
+        print("epoch_%d start" % epoch_idx)
+
+        t0 = time.time()
+        i = 0
+        newest_ppl = 0
+        for data in train_reader():
+            i += 1
+            lod_src_wordseq = utils.to_lodtensor(
+                [dat[0] for dat in data], place)
+            lod_dst_wordseq = utils.to_lodtensor(
+                [dat[1] for dat in data], place)
+            ret_avg_cost = train_exe.run(
+                feed={
+                    "src_wordseq": lod_src_wordseq,
+                    "dst_wordseq": lod_dst_wordseq
+                },
+                fetch_list=fetch_list)
+            # Perplexity is the exponential of the fetched average cost.
+            avg_ppl = np.exp(ret_avg_cost[0])
+            newest_ppl = np.mean(avg_ppl)
+            if i % 10 == 0:
+                print("step:%d ppl:%.3f" % (i, newest_ppl))
+
+        t1 = time.time()
+        total_time += t1 - t0
+        print("epoch:%d num_steps:%d time_cost(s):%f" %
+              (epoch_idx, i, total_time / epoch_idx))
+
+        if pass_idx == pass_num - 1 and args.enable_ce:
+            #Note: The following logs are special for CE monitoring.
+            #Other situations do not need to care about these logs.
+            # get_cards expects the parsed args object, not the bare flag.
+            gpu_num = get_cards(args)
+            if gpu_num == 1:
+                print("kpis\trsc15_pass_duration\t%s" %
+                      (total_time / epoch_idx))
+                print("kpis\trsc15_avg_ppl\t%s" % newest_ppl)
+            else:
+                print("kpis\trsc15_pass_duration_card%s\t%s" %
+                      (gpu_num, total_time / epoch_idx))
+                print("kpis\trsc15_avg_ppl_card%s\t%s" %
+                      (gpu_num, newest_ppl))
+        save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
+        feed_var_names = ["src_wordseq", "dst_wordseq"]
+        fetch_vars = [avg_cost, acc]
+        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
+        print("model saved in %s" % save_dir)
+
+    print("finish training")
+
+def get_cards(args):
+    """
+    Return the number of GPU cards the run should assume.
+
+    :param args: parsed arguments exposing ``enable_ce`` and ``num_devices``.
+    :return: with ``enable_ce`` set, the number of comma-separated entries in
+        the ``CUDA_VISIBLE_DEVICES`` environment variable (falling back to
+        ``args.num_devices`` when the variable is unset); otherwise
+        ``args.num_devices``.
+    """
+    if args.enable_ce:
+        cards = os.environ.get('CUDA_VISIBLE_DEVICES')
+        # An unset variable yields None, which has no split(); fall through
+        # to num_devices in that case.
+        if cards is not None:
+            return len(cards.split(","))
+    return args.num_devices
+
+def train_net():
+    """Prepare the small sample data set and run the training loop on it."""
+    per_card_batch = 50
+    cli_args = parse_args()
+    # The reader batch is scaled by the card count; the test reader returned
+    # by prepare_data is not needed for training.
+    vocab, train_reader, _ = utils.prepare_data(
+        "small_train.txt",
+        "small_test.txt",
+        batch_size=per_card_batch * get_cards(cli_args),
+        buffer_size=1000,
+        word_freq_threshold=0)
+    train(
+        train_reader=train_reader,
+        vocab=vocab,
+        network=network,
+        hid_size=100,
+        base_lr=0.01,
+        batch_size=per_card_batch,
+        pass_num=10,
+        use_cuda=True,
+        parallel=False,
+        model_dir="model_recall20",
+        init_low_bound=-0.1,
+        init_high_bound=0.1)
+
+
+if __name__ == "__main__":
+    train_net()
--- a/fluid/recommender/gru4rec/utils.py
+++ b/fluid/recommender/gru4rec/utils.py
+import sys
+import time
+import numpy as np
+import paddle.fluid as fluid
+import paddle
+import data_preprocess as dp
+import sort_batch as sortb
+
+def to_lodtensor(data, place):
+    """
+    Pack a batch of variable-length sequences into a ``fluid.LoDTensor``.
+
+    :param data: list of sequences of IDs.
+    :param place: place object passed to ``LoDTensor.set``.
+    :return: LoDTensor holding the concatenated IDs as an ``int64`` array of
+        shape ``[total_len, 1]`` with a single level of LoD offsets.
+    """
+    # Cumulative offsets: [0, len(seq0), len(seq0) + len(seq1), ...]
+    offsets = [0]
+    for seq in data:
+        offsets.append(offsets[-1] + len(seq))
+
+    flat = np.concatenate(data, axis=0).astype("int64")
+    column = flat.reshape([len(flat), 1])
+
+    tensor = fluid.LoDTensor()
+    tensor.set(column, place)
+    tensor.set_lod([offsets])
+    return tensor
+
+def prepare_data(train_filename, test_filename, batch_size,
+                 buffer_size=1000, word_freq_threshold=0, enable_ce=False):
+    """
+    Build the vocabulary and the batched train/test readers.
+
+    :param train_filename: path of the training text file.
+    :param test_filename: path of the test text file.
+    :param batch_size: number of samples per mini-batch.
+    :param buffer_size: shuffle buffer size. NOTE(review): the same value is
+        also passed to the readers as the maximum sequence length ``n`` --
+        confirm this reuse is intended.
+    :param word_freq_threshold: forwarded to ``dp.build_dict`` as the minimum
+        word frequency.
+    :param enable_ce: when true, training data is batched with
+        ``paddle.batch`` without shuffling or length sorting.
+    :return: ``(vocab, train_reader, test_reader)``.
+    """
+    print("start constuct word dict")
+    vocab = dp.build_dict(word_freq_threshold, train_filename, test_filename)
+    print("construct word dict done\n")
+
+    train_samples = dp.train(
+        train_filename, vocab, buffer_size, data_type=dp.DataType.SEQ)
+    if enable_ce:
+        train_reader = paddle.batch(train_samples, batch_size)
+    else:
+        # Shuffle, then sort groups of 20 batches by length before batching.
+        train_reader = sortb.batch(
+            paddle.reader.shuffle(train_samples, buf_size=buffer_size),
+            batch_size, batch_size * 20)
+    test_reader = sortb.batch(
+        dp.test(test_filename, vocab, buffer_size, data_type=dp.DataType.SEQ),
+        batch_size, batch_size * 20)
+    return vocab, train_reader, test_reader