Unverified commit 5f187850, authored by: Z zhang wenhui, committed by: GitHub

Update 2.0 model (#4905)

* update api 1.8

* fix paddlerec readme

* update 2.0, test=develop
Parent 3fad507e
[156, 51, 24, 103, 195, 35, 188, 16, 224, 173, 116, 3, 226, 11, 64, 94, 6, 70, 197, 164, 220, 77, 172, 194, 227, 12, 65, 129, 39, 38, 75, 210, 215, 36, 46, 185, 76, 222, 108, 78, 120, 71, 33, 189, 135, 97, 90, 219, 105, 205, 136, 167, 106, 29, 157, 125, 217, 121, 175, 143, 200, 45, 179, 37, 86, 140, 225, 47, 20, 228, 4, 209, 177, 178, 171, 58, 48, 118, 9, 149, 55, 192, 82, 17, 43, 54, 93, 96, 159, 216, 18, 206, 223, 104, 132, 182, 60, 109, 28, 180, 44, 166, 128, 27, 163, 141, 229, 102, 150, 7, 83, 198, 41, 191, 114, 117, 122, 161, 130, 174, 176, 160, 201, 49, 112, 69, 165, 95, 133, 92, 59, 110, 151, 203, 67, 169, 21, 66, 80, 22, 23, 152, 40, 127, 111, 186, 72, 26, 190, 42, 0, 63, 53, 124, 137, 85, 126, 196, 187, 208, 98, 25, 15, 170, 193, 168, 202, 31, 146, 147, 113, 32, 204, 131, 68, 84, 213, 19, 81, 79, 162, 199, 107, 50, 2, 207, 10, 181, 144, 139, 134, 62, 155, 142, 214, 212, 61, 52, 101, 99, 158, 145, 13, 153, 56, 184, 221]
\ No newline at end of file
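# Download helper for the Criteo dataset: fetch and extract dac.tar.gz, pull the
# prebuilt feature dictionary feat_dict_10.pkl2 into aid_data/, run preprocess.py,
# and remove the intermediate raw_data directory once preprocessing is done.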
import os
import shutil
import sys
LOCAL_PATH = os.path.dirname(os.path.abspath(__file__))
TOOLS_PATH = os.path.join(LOCAL_PATH, "..", "..", "tools")
sys.path.append(TOOLS_PATH)
from tools import download_file_and_uncompress, download_file
if __name__ == '__main__':
url = "https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz"
url2 = "https://paddlerec.bj.bcebos.com/deepfm%2Ffeat_dict_10.pkl2"
print("download and extract starting...")
download_file_and_uncompress(url)
if not os.path.exists("aid_data"):
os.makedirs("aid_data")
download_file(url2, "./aid_data/feat_dict_10.pkl2", True)
print("download and extract finished")
print("preprocessing...")
os.system("python preprocess.py")
print("preprocess done")
shutil.rmtree("raw_data")
print("done")
from __future__ import division
import os
import numpy
from collections import Counter
import shutil
import pickle
def get_raw_data(input_file, raw_data, ins_per_file):
if not os.path.isdir(raw_data):
os.mkdir(raw_data)
fin = open(input_file, 'r')
fout = open(os.path.join(raw_data, 'part-0'), 'w')
for line_idx, line in enumerate(fin):
if line_idx % ins_per_file == 0 and line_idx != 0:
fout.close()
cur_part_idx = int(line_idx / ins_per_file)
fout = open(
os.path.join(raw_data, 'part-' + str(cur_part_idx)), 'w')
fout.write(line)
fout.close()
fin.close()
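# split_data: randomly pick 90% of the part files for training and move the rest
# to the test directory. The chosen file indices are persisted to
# aid_data/train_file_idx.txt so that re-running the script reproduces the split.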
def split_data(raw_data, aid_data, train_data, test_data):
split_rate_ = 0.9
dir_train_file_idx_ = os.path.join(aid_data, 'train_file_idx.txt')
filelist_ = [
os.path.join(raw_data, 'part-%d' % x)
for x in range(len(os.listdir(raw_data)))
]
if not os.path.exists(dir_train_file_idx_):
train_file_idx = list(
numpy.random.choice(
len(filelist_), int(len(filelist_) * split_rate_), False))
with open(dir_train_file_idx_, 'w') as fout:
fout.write(str(train_file_idx))
else:
with open(dir_train_file_idx_, 'r') as fin:
train_file_idx = eval(fin.read())
for idx in range(len(filelist_)):
if idx in train_file_idx:
shutil.move(filelist_[idx], train_data)
else:
shutil.move(filelist_[idx], test_data)
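# get_feat_dict: each Criteo line is tab-separated as <label> followed by 13
# continuous fields and 26 categorical fields. Continuous features keep the fixed
# ids 1..13; every categorical value seen at least freq_ (10) times receives its
# own id. The resulting dict is pickled to aid_data/feat_dict_10.pkl2.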
def get_feat_dict(input_file, aid_data, print_freq=100000, total_ins=45000000):
freq_ = 10
dir_feat_dict_ = os.path.join(aid_data, 'feat_dict_' + str(freq_) + '.pkl2')
continuous_range_ = range(1, 14)
categorical_range_ = range(14, 40)
if not os.path.exists(dir_feat_dict_):
# print('generate a feature dict')
# Count the number of occurrences of discrete features
feat_cnt = Counter()
with open(input_file, 'r') as fin:
for line_idx, line in enumerate(fin):
if line_idx % print_freq == 0:
print(r'generating feature dict {:.2f} %'.format((
line_idx / total_ins) * 100))
features = line.rstrip('\n').split('\t')
for idx in categorical_range_:
if features[idx] == '': continue
feat_cnt.update([features[idx]])
# Only retain discrete features with high frequency
dis_feat_set = set()
for feat, ot in feat_cnt.items():
if ot >= freq_:
dis_feat_set.add(feat)
# Create a dictionary for continuous and discrete features
feat_dict = {}
tc = 1
# Continuous features
for idx in continuous_range_:
feat_dict[idx] = tc
tc += 1
for feat in dis_feat_set:
feat_dict[feat] = tc
tc += 1
# Save dictionary
with open(dir_feat_dict_, 'wb') as fout:
pickle.dump(feat_dict, fout, protocol=2)
print('args.num_feat ', len(feat_dict) + 1)
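# preprocess: runs the three steps above. outdir must already exist; ins_per_file
# sets the shard size and print_freq defaults to 10 * ins_per_file. A minimal
# sketch of an alternative invocation on a smaller sample (the file name and
# counts here are illustrative assumptions, not files shipped with this repo):
#
#   preprocess('train_sample.txt', './', ins_per_file=10000, total_ins=1000000)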
def preprocess(input_file,
outdir,
ins_per_file,
total_ins=None,
print_freq=None):
train_data = os.path.join(outdir, "train_data")
test_data = os.path.join(outdir, "test_data")
aid_data = os.path.join(outdir, "aid_data")
raw_data = os.path.join(outdir, "raw_data")
if not os.path.isdir(train_data):
os.mkdir(train_data)
if not os.path.isdir(test_data):
os.mkdir(test_data)
if not os.path.isdir(aid_data):
os.mkdir(aid_data)
if print_freq is None:
print_freq = 10 * ins_per_file
get_raw_data(input_file, raw_data, ins_per_file)
split_data(raw_data, aid_data, train_data, test_data)
get_feat_dict(input_file, aid_data, print_freq, total_ins)
print('Done!')
if __name__ == '__main__':
preprocess('train.txt', './', 200000, 45000000)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import unittest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.dygraph.nn import Embedding
import paddle.fluid.framework as framework
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.optimizer import AdagradOptimizer
from paddle.fluid.dygraph.base import to_variable
import numpy as np
import six
import reader
import model_check
import time
from args import *
import sys
if sys.version[0] == '2':
reload(sys)
sys.setdefaultencoding("utf-8")
class SimpleGRURNN(fluid.Layer):
def __init__(self,
hidden_size,
num_steps,
num_layers=2,
init_scale=0.1,
dropout=None):
super(SimpleGRURNN, self).__init__()
self._hidden_size = hidden_size
self._num_layers = num_layers
self._init_scale = init_scale
self._dropout = dropout
self._num_steps = num_steps
self.weight_1_arr = []
self.weight_2_arr = []
self.weight_3_arr = []
self.bias_1_arr = []
self.bias_2_arr = []
self.mask_array = []
for i in range(self._num_layers):
weight_1 = self.create_parameter(
attr=fluid.ParamAttr(
initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale)),
shape=[self._hidden_size * 2, self._hidden_size * 2],
dtype="float32",
default_initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale))
self.weight_1_arr.append(self.add_parameter('w1_%d' % i, weight_1))
weight_2 = self.create_parameter(
attr=fluid.ParamAttr(
initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale)),
shape=[self._hidden_size, self._hidden_size],
dtype="float32",
default_initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale))
self.weight_2_arr.append(self.add_parameter('w2_%d' % i, weight_2))
weight_3 = self.create_parameter(
attr=fluid.ParamAttr(
initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale)),
shape=[self._hidden_size, self._hidden_size],
dtype="float32",
default_initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale))
self.weight_3_arr.append(self.add_parameter('w3_%d' % i, weight_3))
bias_1 = self.create_parameter(
attr=fluid.ParamAttr(
initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale)),
shape=[self._hidden_size * 2],
dtype="float32",
default_initializer=fluid.initializer.Constant(0.0))
self.bias_1_arr.append(self.add_parameter('b1_%d' % i, bias_1))
bias_2 = self.create_parameter(
attr=fluid.ParamAttr(
initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale)),
shape=[self._hidden_size * 1],
dtype="float32",
default_initializer=fluid.initializer.Constant(0.0))
self.bias_2_arr.append(self.add_parameter('b2_%d' % i, bias_2))
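# forward runs a stacked GRU cell step by step. For each layer and time step:
#   u, r = split(concat([x_t, h_prev]) . W1 + b1)              # update / reset gates
#   c_t  = tanh(x_t . W2 + (sigmoid(r) * h_prev) . W3 + b2)    # candidate state
#   h_t  = sigmoid(u) * h_prev + (1 - sigmoid(u)) * c_t
# Per-step outputs are concatenated into [batch, num_steps, hidden_size] and the
# final hidden states are returned as [num_layers, batch, hidden_size].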
def forward(self, input_embedding, init_hidden=None):
hidden_array = []
for i in range(self._num_layers):
hidden_array.append(init_hidden[i])
res = []
for index in range(self._num_steps):
step_input = input_embedding[:, index, :]
for k in range(self._num_layers):
pre_hidden = hidden_array[k]
weight_1 = self.weight_1_arr[k]
weight_2 = self.weight_2_arr[k]
weight_3 = self.weight_3_arr[k]
bias_1 = self.bias_1_arr[k]
bias_2 = self.bias_2_arr[k]
nn = fluid.layers.concat([step_input, pre_hidden], 1)
gate_input = fluid.layers.matmul(x=nn, y=weight_1)
gate_input = fluid.layers.elementwise_add(gate_input, bias_1)
u, r = fluid.layers.split(gate_input, num_or_sections=2, dim=-1)
hidden_c = fluid.layers.tanh(
fluid.layers.elementwise_add(
fluid.layers.matmul(
x=step_input, y=weight_2) + fluid.layers.matmul(
x=(fluid.layers.sigmoid(r) * pre_hidden),
y=weight_3),
bias_2))
hidden_state = fluid.layers.sigmoid(u) * pre_hidden + (
1.0 - fluid.layers.sigmoid(u)) * hidden_c
hidden_array[k] = hidden_state
step_input = hidden_state
if self._dropout is not None and self._dropout > 0.0:
step_input = fluid.layers.dropout(
step_input,
dropout_prob=self._dropout,
dropout_implementation='upscale_in_train')
res.append(step_input)
real_res = fluid.layers.concat(res, 1)
real_res = fluid.layers.reshape(
real_res, [-1, self._num_steps, self._hidden_size])
last_hidden = fluid.layers.concat(hidden_array, 1)
last_hidden = fluid.layers.reshape(
last_hidden, shape=[-1, self._num_layers, self._hidden_size])
last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2])
return real_res, last_hidden
class PtbModel(fluid.Layer):
def __init__(self,
name_scope,
hidden_size,
vocab_size,
num_layers=2,
num_steps=20,
init_scale=0.1,
dropout=None):
#super(PtbModel, self).__init__(name_scope)
super(PtbModel, self).__init__()
self.hidden_size = hidden_size
self.vocab_size = vocab_size
self.init_scale = init_scale
self.num_layers = num_layers
self.num_steps = num_steps
self.dropout = dropout
self.simple_gru_rnn = SimpleGRURNN(
#self.full_name(),
hidden_size,
num_steps,
num_layers=num_layers,
init_scale=init_scale,
dropout=dropout)
self.embedding = Embedding(
#self.full_name(),
size=[vocab_size, hidden_size],
dtype='float32',
is_sparse=False,
param_attr=fluid.ParamAttr(
name='embedding_para',
initializer=fluid.initializer.UniformInitializer(
low=-init_scale, high=init_scale)))
self.softmax_weight = self.create_parameter(
attr=fluid.ParamAttr(),
shape=[self.hidden_size, self.vocab_size],
dtype="float32",
default_initializer=fluid.initializer.UniformInitializer(
low=-self.init_scale, high=self.init_scale))
self.softmax_bias = self.create_parameter(
attr=fluid.ParamAttr(),
shape=[self.vocab_size],
dtype="float32",
default_initializer=fluid.initializer.UniformInitializer(
low=-self.init_scale, high=self.init_scale))
def build_once(self, input, label, init_hidden):
pass
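# forward: embed the input ids, optionally apply dropout, run the GRU stack,
# project onto the vocabulary, then compute softmax cross-entropy summed over the
# num_steps positions plus top-20 accuracy (reported as recall@20 during eval).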
def forward(self, input, label, init_hidden):
init_h = fluid.layers.reshape(
init_hidden, shape=[self.num_layers, -1, self.hidden_size])
x_emb = self.embedding(input)
x_emb = fluid.layers.reshape(
x_emb, shape=[-1, self.num_steps, self.hidden_size])
if self.dropout is not None and self.dropout > 0.0:
x_emb = fluid.layers.dropout(
x_emb,
dropout_prob=self.dropout,
dropout_implementation='upscale_in_train')
rnn_out, last_hidden = self.simple_gru_rnn(x_emb, init_h)
projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
projection = fluid.layers.elementwise_add(projection, self.softmax_bias)
loss = fluid.layers.softmax_with_cross_entropy(
logits=projection, label=label, soft_label=False)
pre_2d = fluid.layers.reshape(projection, shape=[-1, self.vocab_size])
label_2d = fluid.layers.reshape(label, shape=[-1, 1])
acc = fluid.layers.accuracy(input=pre_2d, label=label_2d, k=20)
loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps])
loss = fluid.layers.reduce_mean(loss, dim=[0])
loss = fluid.layers.reduce_sum(loss)
return loss, last_hidden, acc
def debug_emb(self):
np.save("emb_grad", self.x_emb.gradient())
def train_ptb_lm():
args = parse_args()
# check whether use_gpu=True was set with a CPU-only PaddlePaddle build
model_check.check_cuda(args.use_gpu)
# check if paddlepaddle version is satisfied
model_check.check_version()
model_type = args.model_type
vocab_size = 37484
if model_type == "gru4rec":
num_layers = 1
batch_size = 500
hidden_size = 100
num_steps = 10
init_scale = 0.1
max_grad_norm = 5.0
epoch_start_decay = 10
max_epoch = 5
dropout = 0.0
lr_decay = 0.5
base_learning_rate = 0.05
else:
print("model type not support")
return
with fluid.dygraph.guard(core.CUDAPlace(0)):
if args.ce:
print("ce mode")
seed = 33
np.random.seed(seed)
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
max_epoch = 1
ptb_model = PtbModel(
"ptb_model",
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
num_steps=num_steps,
init_scale=init_scale,
dropout=dropout)
if args.init_from_pretrain_model:
if not os.path.exists(args.init_from_pretrain_model + '.pdparams'):
print(args.init_from_pretrain_model)
raise Warning("The pretrained params do not exist.")
return
para_dict, _ = fluid.load_dygraph(args.init_from_pretrain_model)
ptb_model.set_dict(para_dict)
print("finished initializing model from pretrained params from %s" %
(args.init_from_pretrain_model))
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
last_hidden = None
data_path = args.data_path
print("begin to load data")
ptb_data = reader.get_ptb_data(data_path)
print("finished load data")
train_data, valid_data, test_data = ptb_data
batch_len = len(train_data) // batch_size
total_batch_size = (batch_len - 1) // num_steps
print("total_batch_size:", total_batch_size)
log_interval = total_batch_size // 20
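# Build a piecewise learning-rate schedule: boundaries are global batch counts at
# each epoch end, and the rate is halved (lr_decay) for every epoch beyond
# epoch_start_decay. With gru4rec's max_epoch=5 and epoch_start_decay=10 the rate
# effectively stays at base_learning_rate.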
bd = []
lr_arr = [base_learning_rate]
for i in range(1, max_epoch):
bd.append(total_batch_size * i)
new_lr = base_learning_rate * (lr_decay**
max(i + 1 - epoch_start_decay, 0.0))
lr_arr.append(new_lr)
grad_clip = fluid.clip.GradientClipByGlobalNorm(max_grad_norm)
sgd = AdagradOptimizer(
parameter_list=ptb_model.parameters(),
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr),
grad_clip=grad_clip)
print("parameters:--------------------------------")
for para in ptb_model.parameters():
print(para.name)
print("parameters:--------------------------------")
def eval(model, data):
print("begion to eval")
total_loss = 0.0
iters = 0.0
init_hidden_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
model.eval()
train_data_iter = reader.get_data_iter(data, batch_size, num_steps)
init_hidden = to_variable(init_hidden_data)
accum_num_recall = 0.0
for batch_id, batch in enumerate(train_data_iter):
x_data, y_data = batch
x_data = x_data.reshape((-1, num_steps, 1))
y_data = y_data.reshape((-1, num_steps, 1))
x = to_variable(x_data)
y = to_variable(y_data)
dy_loss, last_hidden, acc = ptb_model(x, y, init_hidden)
out_loss = dy_loss.numpy()
acc_ = acc.numpy()[0]
accum_num_recall += acc_
if batch_id % 1 == 0:
print("batch_id:%d recall@20:%.4f" %
(batch_id, accum_num_recall / (batch_id + 1)))
init_hidden = last_hidden
total_loss += out_loss
iters += num_steps
print("eval finished")
ppl = np.exp(total_loss / iters)
print("recall@20 ", accum_num_recall / (batch_id + 1))
if args.ce:
print("kpis\ttest_ppl\t%0.3f" % ppl[0])
for epoch_id in range(max_epoch):
ptb_model.train()
total_loss = 0.0
iters = 0.0
init_hidden_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
train_data_iter = reader.get_data_iter(train_data, batch_size,
num_steps)
init_hidden = to_variable(init_hidden_data)
start_time = time.time()
for batch_id, batch in enumerate(train_data_iter):
x_data, y_data = batch
x_data = x_data.reshape((-1, num_steps, 1))
y_data = y_data.reshape((-1, num_steps, 1))
x = to_variable(x_data)
y = to_variable(y_data)
dy_loss, last_hidden, acc = ptb_model(x, y, init_hidden)
out_loss = dy_loss.numpy()
acc_ = acc.numpy()[0]
init_hidden = last_hidden.detach()
dy_loss.backward()
sgd.minimize(dy_loss)
ptb_model.clear_gradients()
total_loss += out_loss
iters += num_steps
if batch_id > 0 and batch_id % 100 == 1:
ppl = np.exp(total_loss / iters)
print(
"-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, acc: %.5f, lr: %.5f"
% (epoch_id, batch_id, ppl[0], acc_,
sgd._global_learning_rate().numpy()))
print("one ecpoh finished", epoch_id)
print("time cost ", time.time() - start_time)
ppl = np.exp(total_loss / iters)
print("-- Epoch:[%d]; ppl: %.5f" % (epoch_id, ppl[0]))
if args.ce:
print("kpis\ttrain_ppl\t%0.3f" % ppl[0])
save_model_dir = os.path.join(args.save_model_dir,
str(epoch_id), 'params')
fluid.save_dygraph(ptb_model.state_dict(), save_model_dir)
print("Saved model to: %s.\n" % save_model_dir)
eval(ptb_model, test_data)
#eval(ptb_model, test_data)
train_ptb_lm()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import unittest
import paddle
import numpy as np
import six
import reader
import model_check
import time
from args import *
import sys
if sys.version[0] == '2':
reload(sys)
sys.setdefaultencoding("utf-8")
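# This file ports the fluid-based gru4rec trainer above to the paddle 2.x API:
# paddle.disable_static replaces the dygraph guard, parameters use
# paddle.nn.initializer.Uniform, tensor ops come from the paddle.* namespace, and
# the optimizer becomes paddle.optimizer.Adagrad with paddle.nn.ClipGradByGlobalNorm.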
class SimpleGRURNN(paddle.fluid.Layer):
def __init__(self,
hidden_size,
num_steps,
num_layers=2,
init_scale=0.1,
dropout=None):
super(SimpleGRURNN, self).__init__()
self._hidden_size = hidden_size
self._num_layers = num_layers
self._init_scale = init_scale
self._dropout = dropout
self._num_steps = num_steps
self.weight_1_arr = []
self.weight_2_arr = []
self.weight_3_arr = []
self.bias_1_arr = []
self.bias_2_arr = []
self.mask_array = []
for i in range(self._num_layers):
weight_1 = self.create_parameter(
attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Uniform(
low=-self._init_scale, high=self._init_scale)),
shape=[self._hidden_size * 2, self._hidden_size * 2],
dtype="float32",
default_initializer=paddle.nn.initializer.Uniform(
low=-self._init_scale, high=self._init_scale))
self.weight_1_arr.append(self.add_parameter('w1_%d' % i, weight_1))
weight_2 = self.create_parameter(
attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Uniform(
low=-self._init_scale, high=self._init_scale)),
shape=[self._hidden_size, self._hidden_size],
dtype="float32",
default_initializer=paddle.nn.initializer.Uniform(
low=-self._init_scale, high=self._init_scale))
self.weight_2_arr.append(self.add_parameter('w2_%d' % i, weight_2))
weight_3 = self.create_parameter(
attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Uniform(
low=-self._init_scale, high=self._init_scale)),
shape=[self._hidden_size, self._hidden_size],
dtype="float32",
default_initializer=paddle.nn.initializer.Uniform(
low=-self._init_scale, high=self._init_scale))
self.weight_3_arr.append(self.add_parameter('w3_%d' % i, weight_3))
bias_1 = self.create_parameter(
attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Uniform(
low=-self._init_scale, high=self._init_scale)),
shape=[self._hidden_size * 2],
dtype="float32",
default_initializer=paddle.nn.initializer.Constant(0.0))
self.bias_1_arr.append(self.add_parameter('b1_%d' % i, bias_1))
bias_2 = self.create_parameter(
attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Uniform(
low=-self._init_scale, high=self._init_scale)),
shape=[self._hidden_size * 1],
dtype="float32",
default_initializer=paddle.nn.initializer.Constant(0.0))
self.bias_2_arr.append(self.add_parameter('b2_%d' % i, bias_2))
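# Same stacked GRU recurrence as the fluid version above (update/reset gates from
# W1/b1, candidate state from W2/W3/b2), expressed with the paddle 2.x tensor APIs
# (paddle.concat / paddle.matmul / paddle.add / paddle.split).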
def forward(self, input_embedding, init_hidden=None):
hidden_array = []
for i in range(self._num_layers):
hidden_array.append(init_hidden[i])
res = []
for index in range(self._num_steps):
step_input = input_embedding[:, index, :]
for k in range(self._num_layers):
pre_hidden = hidden_array[k]
weight_1 = self.weight_1_arr[k]
weight_2 = self.weight_2_arr[k]
weight_3 = self.weight_3_arr[k]
bias_1 = self.bias_1_arr[k]
bias_2 = self.bias_2_arr[k]
nn = paddle.concat(x=[step_input, pre_hidden], axis=1)
gate_input = paddle.matmul(x=nn, y=weight_1)
gate_input = paddle.add(x=gate_input, y=bias_1)
u, r = paddle.split(x=gate_input, num_or_sections=2, axis=-1)
hidden_c = paddle.tanh(
paddle.add(x=paddle.matmul(
x=step_input, y=weight_2) + paddle.matmul(
x=(paddle.nn.functional.sigmoid(r) * pre_hidden),
y=weight_3),
y=bias_2))
hidden_state = paddle.nn.functional.sigmoid(u) * pre_hidden + (
1.0 - paddle.nn.functional.sigmoid(u)) * hidden_c
hidden_array[k] = hidden_state
step_input = hidden_state
if self._dropout is not None and self._dropout > 0.0:
step_input = paddle.fluid.layers.dropout(
step_input,
dropout_prob=self._dropout,
dropout_implementation='upscale_in_train')
res.append(step_input)
real_res = paddle.concat(x=res, axis=1)
real_res = paddle.fluid.layers.reshape(
real_res, [-1, self._num_steps, self._hidden_size])
last_hidden = paddle.concat(x=hidden_array, axis=1)
last_hidden = paddle.fluid.layers.reshape(
last_hidden, shape=[-1, self._num_layers, self._hidden_size])
last_hidden = paddle.transpose(x=last_hidden, perm=[1, 0, 2])
return real_res, last_hidden
class PtbModel(paddle.fluid.Layer):
def __init__(self,
name_scope,
hidden_size,
vocab_size,
num_layers=2,
num_steps=20,
init_scale=0.1,
dropout=None):
#super(PtbModel, self).__init__(name_scope)
super(PtbModel, self).__init__()
self.hidden_size = hidden_size
self.vocab_size = vocab_size
self.init_scale = init_scale
self.num_layers = num_layers
self.num_steps = num_steps
self.dropout = dropout
self.simple_gru_rnn = SimpleGRURNN(
#self.full_name(),
hidden_size,
num_steps,
num_layers=num_layers,
init_scale=init_scale,
dropout=dropout)
self.embedding = paddle.fluid.dygraph.nn.Embedding(
#self.full_name(),
size=[vocab_size, hidden_size],
dtype='float32',
is_sparse=False,
param_attr=paddle.ParamAttr(
name='embedding_para',
initializer=paddle.nn.initializer.Uniform(
low=-init_scale, high=init_scale)))
self.softmax_weight = self.create_parameter(
attr=paddle.ParamAttr(),
shape=[self.hidden_size, self.vocab_size],
dtype="float32",
default_initializer=paddle.nn.initializer.Uniform(
low=-self.init_scale, high=self.init_scale))
self.softmax_bias = self.create_parameter(
attr=paddle.ParamAttr(),
shape=[self.vocab_size],
dtype="float32",
default_initializer=paddle.nn.initializer.Uniform(
low=-self.init_scale, high=self.init_scale))
def build_once(self, input, label, init_hidden):
pass
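# forward: same flow as the fluid model above: embedding -> optional dropout ->
# GRU stack -> vocabulary projection -> softmax cross-entropy summed over
# num_steps, plus top-20 accuracy (recall@20), built from paddle 2.x ops.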
def forward(self, input, label, init_hidden):
init_h = paddle.fluid.layers.reshape(
init_hidden, shape=[self.num_layers, -1, self.hidden_size])
x_emb = self.embedding(input)
x_emb = paddle.fluid.layers.reshape(
x_emb, shape=[-1, self.num_steps, self.hidden_size])
if self.dropout is not None and self.dropout > 0.0:
x_emb = paddle.fluid.layers.dropout(
x_emb,
dropout_prob=self.dropout,
dropout_implementation='upscale_in_train')
rnn_out, last_hidden = self.simple_gru_rnn(x_emb, init_h)
projection = paddle.matmul(x=rnn_out, y=self.softmax_weight)
projection = paddle.add(x=projection, y=self.softmax_bias)
loss = paddle.nn.functional.softmax_with_cross_entropy(
logits=projection, label=label, soft_label=False)
pre_2d = paddle.fluid.layers.reshape(
projection, shape=[-1, self.vocab_size])
label_2d = paddle.fluid.layers.reshape(label, shape=[-1, 1])
acc = paddle.metric.accuracy(input=pre_2d, label=label_2d, k=20)
loss = paddle.fluid.layers.reshape(loss, shape=[-1, self.num_steps])
loss = paddle.reduce_mean(loss, dim=[0])
loss = paddle.reduce_sum(loss)
return loss, last_hidden, acc
def debug_emb(self):
np.save("emb_grad", self.x_emb.gradient())
def train_ptb_lm():
args = parse_args()
# check whether use_gpu=True was set with a CPU-only PaddlePaddle build
model_check.check_cuda(args.use_gpu)
# check if paddlepaddle version is satisfied
model_check.check_version()
model_type = args.model_type
vocab_size = 37484
if model_type == "gru4rec":
num_layers = 1
batch_size = 500
hidden_size = 100
num_steps = 10
init_scale = 0.1
max_grad_norm = 5.0
epoch_start_decay = 10
max_epoch = 5
dropout = 0.0
lr_decay = 0.5
base_learning_rate = 0.05
else:
print("model type not support")
return
paddle.disable_static(paddle.fluid.core.CUDAPlace(0))
if args.ce:
print("ce mode")
seed = 33
np.random.seed(seed)
paddle.static.default_startup_program().random_seed = seed
paddle.static.default_main_program().random_seed = seed
max_epoch = 1
ptb_model = PtbModel(
"ptb_model",
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
num_steps=num_steps,
init_scale=init_scale,
dropout=dropout)
if args.init_from_pretrain_model:
if not os.path.exists(args.init_from_pretrain_model + '.pdparams'):
print(args.init_from_pretrain_model)
raise Warning("The pretrained params do not exist.")
return
para_dict, _ = paddle.fluid.load_dygraph(args.init_from_pretrain_model)
ptb_model.set_dict(para_dict)
print("finished initializing model from pretrained params from %s" %
(args.init_from_pretrain_model))
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
last_hidden = None
data_path = args.data_path
print("begin to load data")
ptb_data = reader.get_ptb_data(data_path)
print("finished load data")
train_data, valid_data, test_data = ptb_data
batch_len = len(train_data) // batch_size
total_batch_size = (batch_len - 1) // num_steps
print("total_batch_size:", total_batch_size)
log_interval = total_batch_size // 20
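# bd / lr_arr mirror the piecewise schedule from the fluid version, but in this
# 2.0 port the optimizer below is created with a constant base_learning_rate and
# the piecewise decay is left commented out.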
bd = []
lr_arr = [base_learning_rate]
for i in range(1, max_epoch):
bd.append(total_batch_size * i)
new_lr = base_learning_rate * (lr_decay
**max(i + 1 - epoch_start_decay, 0.0))
lr_arr.append(new_lr)
grad_clip = paddle.nn.ClipGradByGlobalNorm(max_grad_norm)
sgd = paddle.optimizer.Adagrad(
parameters=ptb_model.parameters(),
learning_rate=base_learning_rate,
#learning_rate=paddle.fluid.layers.piecewise_decay(
# boundaries=bd, values=lr_arr),
grad_clip=grad_clip)
print("parameters:--------------------------------")
for para in ptb_model.parameters():
print(para.name)
print("parameters:--------------------------------")
def eval(model, data):
print("begion to eval")
total_loss = 0.0
iters = 0.0
init_hidden_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
model.eval()
train_data_iter = reader.get_data_iter(data, batch_size, num_steps)
init_hidden = paddle.to_tensor(
data=init_hidden_data, dtype=None, place=None, stop_gradient=True)
accum_num_recall = 0.0
for batch_id, batch in enumerate(train_data_iter):
x_data, y_data = batch
x_data = x_data.reshape((-1, num_steps, 1))
y_data = y_data.reshape((-1, num_steps, 1))
x = paddle.to_tensor(
data=x_data, dtype=None, place=None, stop_gradient=True)
y = paddle.to_tensor(
data=y_data, dtype=None, place=None, stop_gradient=True)
dy_loss, last_hidden, acc = ptb_model(x, y, init_hidden)
out_loss = dy_loss.numpy()
acc_ = acc.numpy()[0]
accum_num_recall += acc_
if batch_id % 1 == 0:
print("batch_id:%d recall@20:%.4f" %
(batch_id, accum_num_recall / (batch_id + 1)))
init_hidden = last_hidden
total_loss += out_loss
iters += num_steps
print("eval finished")
ppl = np.exp(total_loss / iters)
print("recall@20 ", accum_num_recall / (batch_id + 1))
if args.ce:
print("kpis\ttest_ppl\t%0.3f" % ppl[0])
for epoch_id in range(max_epoch):
ptb_model.train()
total_loss = 0.0
iters = 0.0
init_hidden_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
train_data_iter = reader.get_data_iter(train_data, batch_size,
num_steps)
init_hidden = paddle.to_tensor(
data=init_hidden_data, dtype=None, place=None, stop_gradient=True)
start_time = time.time()
for batch_id, batch in enumerate(train_data_iter):
x_data, y_data = batch
x_data = x_data.reshape((-1, num_steps, 1))
y_data = y_data.reshape((-1, num_steps, 1))
x = paddle.to_tensor(
data=x_data, dtype=None, place=None, stop_gradient=True)
y = paddle.to_tensor(
data=y_data, dtype=None, place=None, stop_gradient=True)
dy_loss, last_hidden, acc = ptb_model(x, y, init_hidden)
out_loss = dy_loss.numpy()
acc_ = acc.numpy()[0]
init_hidden = last_hidden.detach()
dy_loss.backward()
sgd.minimize(dy_loss)
ptb_model.clear_gradients()
total_loss += out_loss
iters += num_steps
if batch_id > 0 and batch_id % 100 == 1:
ppl = np.exp(total_loss / iters)
print(
"-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, acc: %.5f, lr: %.5f"
% (epoch_id, batch_id, ppl[0], acc_,
sgd._global_learning_rate().numpy()))
print("one ecpoh finished", epoch_id)
print("time cost ", time.time() - start_time)
ppl = np.exp(total_loss / iters)
print("-- Epoch:[%d]; ppl: %.5f" % (epoch_id, ppl[0]))
if args.ce:
print("kpis\ttrain_ppl\t%0.3f" % ppl[0])
save_model_dir = os.path.join(args.save_model_dir,
str(epoch_id), 'params')
paddle.fluid.save_dygraph(ptb_model.state_dict(), save_model_dir)
print("Saved model to: %s.\n" % save_model_dir)
eval(ptb_model, test_data)
paddle.enable_static()
#eval(ptb_model, test_data)
train_ptb_lm()