add cloud

2d392828 · Yao Cheng · chengyao · 8b873b29 · 2d392828 · 2d392828
2 changed files
--- a/fluid/text_classification/clouds/scdb_parallel_executor.py
+++ b/fluid/text_classification/clouds/scdb_parallel_executor.py
+import unittest
+import contextlib
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+import numpy as np
+import sys
+import time
+import os
+import json
+import random
+
+def to_lodtensor(data, place):
+    """
+    Pack a batch of variable-length sequences into one fluid.LoDTensor.
+
+    The sequences in `data` are concatenated along axis 0, cast to int64 and
+    reshaped into a single column. The one-level LoD stores the cumulative
+    sequence lengths, starting at 0.
+    """
+    offsets = [0]
+    for seq in data:
+        offsets.append(offsets[-1] + len(seq))
+
+    flat = np.concatenate(data, axis=0).astype("int64")
+    flat = flat.reshape([len(flat), 1])
+
+    tensor = fluid.LoDTensor()
+    tensor.set(flat, place)
+    tensor.set_lod([offsets])
+    return tensor
+
+
+def load_vocab(filename):
+    """
+    Read a vocabulary file with one token per line.
+
+    Each stripped line is mapped to its 0-based line number (a repeated
+    token keeps the number of its last occurrence). "<unk>" is then set to
+    the number of distinct entries read.
+    """
+    vocab = {}
+    with open(filename) as f:
+        for wid, line in enumerate(f):
+            vocab[line.strip()] = wid
+    vocab["<unk>"] = len(vocab)
+    return vocab
+
+
+def data2tensor(data, place):
+    """
+    Convert a batch of (word_ids, label) samples into a feed dict.
+
+    Returns {"words": LoDTensor built by to_lodtensor,
+             "label": int64 array reshaped to [-1, 1]}.
+    """
+    # Real lists instead of map(): on Python 3 map() is a one-shot iterator,
+    # which neither np.array() nor to_lodtensor (it walks the batch twice)
+    # can consume correctly.
+    input_seq = to_lodtensor([sample[0] for sample in data], place)
+    y_data = np.array([sample[1] for sample in data]).astype("int64")
+    y_data = y_data.reshape([-1, 1])
+    return {"words": input_seq, "label": y_data}
+
+def data2pred(data, place):
+    """
+    Convert a batch of samples into an inference feed dict.
+
+    Only the first element of every sample (the word ids) is used; the
+    result is {"words": LoDTensor built by to_lodtensor}.
+    """
+    # The label array that used to be built here was never returned, so it
+    # is no longer computed. A list (not map()) keeps this Python 3 safe.
+    input_seq = to_lodtensor([sample[0] for sample in data], place)
+    return {"words": input_seq}
+
+def load_dict(vocab):
+    """
+    Load a word dict from a GB18030-encoded "<word>\t<id>" file.
+
+    Args:
+        vocab: path of the vocabulary file.
+
+    Returns:
+        dict mapping each decoded word to its integer id.
+    """
+    word_dict = dict()
+    # Read raw bytes and decode once per line: str.decode() exists only on
+    # Python 2, whereas bytes.decode() behaves the same on Python 2 and 3.
+    with open(vocab, "rb") as fin:
+        for line in fin:
+            cols = line.strip(b"\r\n").decode("gb18030").split("\t")
+            word_dict[cols[0]] = int(cols[1])
+    return word_dict
+
+
+def save_dict(word_dict, vocab):
+    """
+    Write a word dict to `vocab` as GB18030-encoded "<word>\t<id>" lines.
+
+    Args:
+        word_dict: mapping from word to id.
+        vocab: destination path; an existing file is overwritten.
+    """
+    # Binary mode because encoded bytes are written; items() instead of the
+    # Python-2-only iteritems().
+    with open(vocab, "wb") as fout:
+        for word, idx in word_dict.items():
+            fout.write((u"%s\t%s\n" % (word, idx)).encode("gb18030"))
+
+def build_dict(fname):
+    """
+    Build a word dict from a GB18030-encoded training file.
+
+    Every line is split on tabs and the second column is split on single
+    spaces into words. Each new word gets the next free id (0, 1, 2, ...).
+    Lines that cannot be decoded or have no second column are reported on
+    stderr and skipped.
+    """
+    word_dict = dict()
+    # Bytes in, text out: bytes.decode() works on both Python 2 and 3.
+    with open(fname, "rb") as fin:
+        for line in fin:
+            try:
+                words = line.strip(b"\r\n").decode("gb18030").split("\t")[1].split(" ")
+            except (UnicodeDecodeError, IndexError):
+                # Only the failures this line can produce are swallowed; a
+                # bare except would also hide KeyboardInterrupt and real bugs.
+                sys.stderr.write("[warning] build_dict: decode error\n")
+                continue
+            for w in words:
+                if w not in word_dict:
+                    word_dict[w] = len(word_dict)
+    return word_dict
+
+
+def scdb_word_dict(vocab="scdb_data/train_set/train.vocab", train_file=None):
+    """
+    Return the word dict, loading it from `vocab` or building it on demand.
+
+    Args:
+        vocab: path of the cached vocabulary file.
+        train_file: training file passed to build_dict when `vocab` does
+            not exist yet; the freshly built dict is then saved to `vocab`.
+
+    Returns:
+        The word dict with "<unk>" set to the dict size before insertion.
+
+    Raises:
+        ValueError: `vocab` does not exist and no `train_file` was given.
+    """
+    if os.path.exists(vocab):
+        w_dict = load_dict(vocab)
+    else:
+        # `train_file` used to be an undefined global, so this branch could
+        # only fail with a NameError; it is now an explicit argument.
+        if train_file is None:
+            raise ValueError(
+                "vocab file %r does not exist and no train_file was given "
+                "to build it from" % vocab)
+        w_dict = build_dict(train_file)
+        save_dict(w_dict, vocab)
+    w_dict["<unk>"] = len(w_dict)
+    return w_dict
+
+
+def data_reader(fname, word_dict, is_dir=False):
+    """
+    Load labelled samples from GB18030 text and return a reader creator.
+
+    Each line is "<label>\t<space separated words>". Words are mapped
+    through `word_dict`; unknown words get the id len(word_dict). Lines that
+    cannot be decoded are reported on stderr and skipped. All samples are
+    loaded into memory and shuffled once, at creation time.
+
+    Args:
+        fname: a file path, or a directory path when `is_dir` is true.
+        word_dict: mapping from word to id.
+        is_dir: read every entry of directory `fname` instead of one file.
+
+    Returns:
+        A zero-argument function that yields (word_ids, label) tuples.
+    """
+    unk_id = len(word_dict)
+    if is_dir:
+        filelist = [os.path.join(fname, f) for f in os.listdir(fname)]
+    else:
+        filelist = [fname]
+
+    all_data = []
+    for each_name in filelist:
+        # Bytes in, text out: bytes.decode() works on both Python 2 and 3.
+        with open(each_name, "rb") as fin:
+            for line in fin:
+                try:
+                    cols = line.strip(b"\r\n").decode("gb18030").split("\t")
+                except UnicodeDecodeError:
+                    # Only decode failures are skipped; a bare except would
+                    # also hide KeyboardInterrupt and programming errors.
+                    sys.stderr.write("warning: ignore decode error\n")
+                    continue
+
+                label = int(cols[0])
+                wids = [word_dict.get(x, unk_id) for x in cols[1].split(" ")]
+                all_data.append((wids, label))
+
+    random.shuffle(all_data)
+
+    def reader():
+        for doc, label in all_data:
+            yield doc, label
+
+    return reader
+
+
+def scdb_train_data(train_dir="scdb_data/train_set/corpus.train.seg", w_dict=None):
+    """
+    Build the training reader: `train_dir` is read as a directory of files.
+    """
+    reader = data_reader(train_dir, w_dict, is_dir=True)
+    return reader
+
+
+def scdb_test_data(test_file, w_dict):
+    """
+    Build a test reader from the single file `test_file`.
+
+    Test set names listed by the original author:
+    test_set=["car", "lbs", "spot", "weibo",
+            "baby", "toutiao", "3c", "movie", "haogan"]
+    """
+    reader = data_reader(test_file, w_dict)
+    return reader
+
+
+def bow_net(data, label,
+            dict_dim,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2):
+    """
+    Bag-of-words classifier.
+
+    embedding -> sum sequence pooling -> tanh -> fc(hid_dim, tanh)
+    -> fc(hid_dim2, tanh) -> fc(class_dim, softmax).
+
+    Returns (avg_cost, acc, prediction).
+    """
+    embedded = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
+    pooled = fluid.layers.sequence_pool(input=embedded, pool_type='sum')
+    pooled_act = fluid.layers.tanh(pooled)
+    hidden1 = fluid.layers.fc(input=pooled_act, size=hid_dim, act="tanh")
+    hidden2 = fluid.layers.fc(input=hidden1, size=hid_dim2, act="tanh")
+    prediction = fluid.layers.fc(
+        input=[hidden2], size=class_dim, act="softmax")
+
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=loss)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, acc, prediction
+
+
+def cnn_net(data, label,
+            dict_dim,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2,
+            win_size=3):
+    """
+    Convolutional classifier.
+
+    embedding -> sequence_conv_pool(hid_dim filters, window win_size, tanh,
+    max pooling) -> fc(hid_dim2) -> fc(class_dim, softmax).
+
+    Returns (avg_cost, acc, prediction).
+    """
+    embedded = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
+    conv_pooled = fluid.nets.sequence_conv_pool(
+        input=embedded,
+        num_filters=hid_dim,
+        filter_size=win_size,
+        act="tanh",
+        pool_type="max")
+    hidden = fluid.layers.fc(input=[conv_pooled], size=hid_dim2)
+    prediction = fluid.layers.fc(
+        input=[hidden], size=class_dim, act="softmax")
+
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=loss)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, acc, prediction
+
+
+def lstm_net(data, label,
+            dict_dim,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2,
+            emb_lr=30.0):
+    """
+    Single-direction LSTM classifier.
+
+    embedding (learning rate emb_lr) -> fc(hid_dim * 4, tanh)
+    -> dynamic_lstm(size=hid_dim * 4) -> max sequence pooling -> tanh
+    -> fc(hid_dim2, tanh) -> fc(class_dim, softmax).
+
+    Returns (avg_cost, acc, prediction).
+    """
+    embedded = fluid.layers.embedding(
+        input=data,
+        size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
+    lstm_in = fluid.layers.fc(input=embedded, size=hid_dim * 4, act='tanh')
+    # dynamic_lstm returns (hidden, cell); only the hidden sequence is used.
+    hidden_seq, _ = fluid.layers.dynamic_lstm(
+        input=lstm_in, size=hid_dim * 4, is_reverse=False)
+    pooled = fluid.layers.sequence_pool(input=hidden_seq, pool_type='max')
+    pooled_act = fluid.layers.tanh(pooled)
+    hidden = fluid.layers.fc(input=pooled_act, size=hid_dim2, act='tanh')
+    prediction = fluid.layers.fc(input=hidden, size=class_dim, act='softmax')
+
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=loss)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, acc, prediction
+
+
+def bilstm_net(data, label,
+            dict_dim,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2,
+            emb_lr=30.0):
+    """
+    Bidirectional LSTM classifier.
+
+    One shared embedding (learning rate emb_lr) feeds two fc(hid_dim * 4,
+    tanh) layers, each followed by a dynamic_lstm (one forward, one with
+    is_reverse=True). The last step of each LSTM output is concatenated on
+    axis 1, then fc(hid_dim2, tanh) -> fc(class_dim, softmax).
+
+    Returns (avg_cost, acc, prediction).
+    """
+    embedded = fluid.layers.embedding(
+        input=data,
+        size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
+
+    fwd_in = fluid.layers.fc(input=embedded, size=hid_dim * 4, act='tanh')
+    bwd_in = fluid.layers.fc(input=embedded, size=hid_dim * 4, act='tanh')
+
+    # dynamic_lstm returns (hidden, cell); only the hidden sequences are used.
+    fwd_seq, _ = fluid.layers.dynamic_lstm(
+        input=fwd_in, size=hid_dim * 4, is_reverse=False)
+    bwd_seq, _ = fluid.layers.dynamic_lstm(
+        input=bwd_in, size=hid_dim * 4, is_reverse=True)
+
+    fwd_last = fluid.layers.sequence_last_step(input=fwd_seq)
+    bwd_last = fluid.layers.sequence_last_step(input=bwd_seq)
+
+    # NOTE(review): these tanh outputs are created but never consumed; the
+    # concat below takes the un-activated last steps. Confirm whether the
+    # activated tensors were meant to be concatenated instead.
+    fwd_last_act = fluid.layers.tanh(fwd_last)
+    bwd_last_act = fluid.layers.tanh(bwd_last)
+
+    merged = fluid.layers.concat(input=[fwd_last, bwd_last], axis=1)
+    hidden = fluid.layers.fc(input=merged, size=hid_dim2, act='tanh')
+    prediction = fluid.layers.fc(input=hidden, size=class_dim, act='softmax')
+
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=loss)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, acc, prediction
+
+
+def gru_net(data, label,
+            dict_dim,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2,
+            emb_lr=30.0):
+    """
+    Single-direction GRU classifier.
+
+    embedding (learning rate emb_lr) -> fc(hid_dim * 3, no activation)
+    -> dynamic_gru(size=hid_dim) -> max sequence pooling -> tanh
+    -> fc(hid_dim2, tanh) -> fc(class_dim, softmax).
+
+    Returns (avg_cost, acc, prediction).
+    """
+    embedded = fluid.layers.embedding(
+        input=data,
+        size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
+    gru_in = fluid.layers.fc(input=embedded, size=hid_dim * 3)
+    hidden_seq = fluid.layers.dynamic_gru(
+        input=gru_in, size=hid_dim, is_reverse=False)
+    pooled = fluid.layers.sequence_pool(input=hidden_seq, pool_type='max')
+    pooled_act = fluid.layers.tanh(pooled)
+    hidden = fluid.layers.fc(input=pooled_act, size=hid_dim2, act='tanh')
+    prediction = fluid.layers.fc(input=hidden, size=class_dim, act='softmax')
+
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=loss)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, acc, prediction
+
+def infer(test_reader,
+          use_cuda,
+          model_path=None):
+    """
+    Evaluate a saved inference model on one or more test readers.
+
+    Args:
+        test_reader: iterable of batch readers; each is called without
+            arguments and yields batches of (word_ids, label) samples.
+        use_cuda: run on CUDAPlace(0) when true, otherwise on CPUPlace.
+        model_path: directory to load with fluid.io.load_inference_model.
+            When None, a message is printed and the function returns.
+
+    Two accuracies are computed per reader from column 1 of the first
+    fetched output (the "positive" score):
+      * class2: label 2 is a hit above 0.5, label 0 below 0.5; samples with
+        label 1 are excluded from the denominator.
+      * class3: label 0 needs score <= 0.45, label 1 needs
+        0.45 < score <= 0.55, label 2 needs score > 0.55.
+    The per-reader accuracies are averaged and printed; nothing is returned.
+    An empty denominator counts as accuracy 0.0 instead of raising.
+    """
+    if model_path is None:
+        print(str(model_path) + " cannot be found")
+        return
+
+    # Honour use_cuda; the callers in this file pass False, i.e. CPU.
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    inference_scope = fluid.core.Scope()
+    with fluid.scope_guard(inference_scope):
+        [inference_program, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(model_path, exe)
+
+        class2_list, class3_list = [], []
+        for each_test_reader in test_reader:
+            class2_hits, class3_hits = 0.0, 0.0
+            total_count, neu_count = 0, 0
+
+            for data in each_test_reader():
+                pred = exe.run(inference_program,
+                               feed=data2pred(data, place),
+                               fetch_list=fetch_targets,
+                               return_numpy=True)
+
+                for i, val in enumerate(data):
+                    pos_score = pred[0][i, 1]
+                    true_label = val[1]
+                    if true_label == 2.0:
+                        if pos_score > 0.5:
+                            class2_hits += 1
+                        if pos_score > 0.55:
+                            class3_hits += 1
+                    elif true_label == 0.0:
+                        if pos_score < 0.5:
+                            class2_hits += 1
+                        if pos_score <= 0.45:
+                            class3_hits += 1
+                    elif true_label == 1.0:
+                        neu_count += 1
+                        if 0.45 < pos_score <= 0.55:
+                            class3_hits += 1
+
+                total_count += len(data)
+
+            # Guard both denominators: a reader may be empty or may contain
+            # only neutral (label 1) samples.
+            polar_count = total_count - neu_count
+            class2_list.append(class2_hits / polar_count if polar_count else 0.0)
+            class3_list.append(class3_hits / total_count if total_count else 0.0)
+
+        reader_count = len(class2_list)
+        class2_acc = sum(class2_list) / reader_count if reader_count else 0.0
+        class3_acc = sum(class3_list) / reader_count if reader_count else 0.0
+        print("[test info] model_path: %s, class2_acc: %f, class3_acc: %f" % (model_path, class2_acc, class3_acc))
+
+
+def start_train(train_reader,
+        test_reader,
+        word_dict,
+        network,
+        use_cuda,
+        parallel,
+        save_dirname,
+        lr=0.2,
+        batch_size=128,
+        pass_num=30):
+    """
+    Train `network` with fluid.ParallelExecutor and evaluate after each pass.
+
+    Args:
+        train_reader: batch reader creator yielding lists of
+            (word_ids, label) samples.
+        test_reader: list of batch reader creators, forwarded to infer().
+        word_dict: vocabulary; the network is built with
+            dict_dim = len(word_dict) + 1.
+        network: callable(data, label, dict_dim) returning
+            (avg_cost, acc, prediction).
+        use_cuda: forwarded to ParallelExecutor; also selects the place of
+            the executor used to save the model.
+        parallel: not used by this function.
+        save_dirname: each pass is saved to "<save_dirname>/epoch<pass_id>".
+        lr: Adagrad learning rate.
+        batch_size: not used by this function (batching is done by the reader).
+        pass_num: number of passes over `train_reader`.
+    """
+    data = fluid.layers.data(
+        name="words",
+        shape=[1],
+        dtype="int64",
+        lod_level=1)
+
+    label = fluid.layers.data(
+        name="label",
+        shape=[1],
+        dtype="int64")
+
+    cost, acc, pred = network(
+        data, label, len(word_dict) + 1)
+
+    optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
+    optimizer.minimize(cost)
+
+    place = fluid.CPUPlace()
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+
+    start_exe = fluid.Executor(place)
+    start_exe.run(fluid.default_startup_program())
+
+    exe = fluid.ParallelExecutor(use_cuda, loss_name=cost.name)
+
+    # One executor for saving, built once and on a place that matches
+    # use_cuda; a hard-wired CUDAPlace(0) cannot work for a CPU-only run.
+    save_place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    save_exe = fluid.Executor(save_place)
+
+    for pass_id in range(pass_num):
+        total_acc, total_cost, total_count = 0.0, 0.0, 0.0
+        # `batch`, not `data`: do not shadow the input layer variable.
+        for batch in train_reader():
+            cost_val, acc_val = exe.run(feed=feeder.feed(batch),
+                                        fetch_list=[cost.name, acc.name])
+            # The fetched values are summed and weighted by the batch size.
+            total_cost += np.array(cost_val).sum() * len(batch)
+            total_acc += np.array(acc_val).sum() * len(batch)
+            total_count += len(batch)
+
+        # An empty reader reports 0.0 instead of raising ZeroDivisionError.
+        avg_cost = total_cost / total_count if total_count else 0.0
+        avg_acc = total_acc / total_count if total_count else 0.0
+        print("[train info]: pass_id: %d, avg_acc: %f, avg_cost: %f" % (pass_id, avg_acc, avg_cost))
+
+        epoch_model = save_dirname + "/" + "epoch" + str(pass_id)
+        fluid.io.save_inference_model(
+            epoch_model,
+            ["words"],
+            pred, save_exe)
+        infer(test_reader, False, epoch_model)
+
+
+def train_net(vocab="./thirdparty/train.vocab",
+            train_dir="./train",
+            test_list=("car", "spot", "weibo", "lbs")):
+    """
+    Train the bi-LSTM classifier on the IMDB dataset with ParallelExecutor.
+
+    `vocab`, `train_dir` and `test_list` are not used by this IMDB run; they
+    belong to the SCDB pipeline kept below as a disabled reference.
+    """
+    # Disabled SCDB pipeline (previously kept inside the docstring):
+    # w_dict = scdb_word_dict(vocab=vocab)
+    # test_files = [ "./thirdparty" + os.sep + f for f in test_list]
+    #
+    # train_reader = paddle.batch(
+    #                     scdb_train_data(train_dir, w_dict),
+    #                     batch_size = 256)
+    #
+    # test_reader = [paddle.batch(scdb_test_data(test_file, w_dict), batch_size = 50)
+    #         for test_file in test_files]
+    w_dict = paddle.dataset.imdb.word_dict()
+    print("dict ready")
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(w_dict), buf_size=50000),
+        batch_size=128)
+
+    test_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.test(w_dict), buf_size=50000),
+        batch_size=128)
+    # infer() expects a list of readers.
+    test_reader = [test_reader]
+    # NOTE: batch_size below is ignored by start_train; the effective batch
+    # size is the 128 configured on the readers above.
+    start_train(train_reader, test_reader, w_dict, bilstm_net, use_cuda=True,
+                parallel=False, save_dirname="scdb_bilstm_model", lr=0.05,
+                pass_num=10, batch_size=256)
+
+
+# Script entry point: run the training pipeline with its default arguments.
+if __name__ == "__main__":
+    train_net()
--- a/fluid/text_classification/clouds/scdb_single_card.py
+++ b/fluid/text_classification/clouds/scdb_single_card.py
+import unittest
+import contextlib
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+import numpy as np
+import sys
+import time
+import os
+import json
+import random
+
+def to_lodtensor(data, place):
+    """
+    Pack a batch of variable-length sequences into one fluid.LoDTensor.
+
+    The sequences in `data` are concatenated along axis 0, cast to int64 and
+    reshaped into a single column. The one-level LoD stores the cumulative
+    sequence lengths, starting at 0.
+    """
+    offsets = [0]
+    for seq in data:
+        offsets.append(offsets[-1] + len(seq))
+
+    flat = np.concatenate(data, axis=0).astype("int64")
+    flat = flat.reshape([len(flat), 1])
+
+    tensor = fluid.LoDTensor()
+    tensor.set(flat, place)
+    tensor.set_lod([offsets])
+    return tensor
+
+
+def load_vocab(filename):
+    """
+    Read a vocabulary file with one token per line.
+
+    Each stripped line is mapped to its 0-based line number (a repeated
+    token keeps the number of its last occurrence). "<unk>" is then set to
+    the number of distinct entries read.
+    """
+    vocab = {}
+    with open(filename) as f:
+        for wid, line in enumerate(f):
+            vocab[line.strip()] = wid
+    vocab["<unk>"] = len(vocab)
+    return vocab
+
+
+def data2tensor(data, place):
+    """
+    Convert a batch of (word_ids, label) samples into a feed dict.
+
+    Returns {"words": LoDTensor built by to_lodtensor,
+             "label": int64 array reshaped to [-1, 1]}.
+    """
+    # Real lists instead of map(): on Python 3 map() is a one-shot iterator,
+    # which neither np.array() nor to_lodtensor (it walks the batch twice)
+    # can consume correctly.
+    input_seq = to_lodtensor([sample[0] for sample in data], place)
+    y_data = np.array([sample[1] for sample in data]).astype("int64")
+    y_data = y_data.reshape([-1, 1])
+    return {"words": input_seq, "label": y_data}
+
+def data2pred(data, place):
+    """
+    Convert a batch of samples into an inference feed dict.
+
+    Only the first element of every sample (the word ids) is used; the
+    result is {"words": LoDTensor built by to_lodtensor}.
+    """
+    # The label array that used to be built here was never returned, so it
+    # is no longer computed. A list (not map()) keeps this Python 3 safe.
+    input_seq = to_lodtensor([sample[0] for sample in data], place)
+    return {"words": input_seq}
+
+def load_dict(vocab):
+    """
+    Load a word dict from a GB18030-encoded "<word>\t<id>" file.
+
+    Args:
+        vocab: path of the vocabulary file.
+
+    Returns:
+        dict mapping each decoded word to its integer id.
+    """
+    word_dict = dict()
+    # Read raw bytes and decode once per line: str.decode() exists only on
+    # Python 2, whereas bytes.decode() behaves the same on Python 2 and 3.
+    with open(vocab, "rb") as fin:
+        for line in fin:
+            cols = line.strip(b"\r\n").decode("gb18030").split("\t")
+            word_dict[cols[0]] = int(cols[1])
+    return word_dict
+
+
+def save_dict(word_dict, vocab):
+    """
+    Write a word dict to `vocab` as GB18030-encoded "<word>\t<id>" lines.
+
+    Args:
+        word_dict: mapping from word to id.
+        vocab: destination path; an existing file is overwritten.
+    """
+    # Binary mode because encoded bytes are written; items() instead of the
+    # Python-2-only iteritems().
+    with open(vocab, "wb") as fout:
+        for word, idx in word_dict.items():
+            fout.write((u"%s\t%s\n" % (word, idx)).encode("gb18030"))
+
+def build_dict(fname):
+    """
+    Build a word dict from a GB18030-encoded training file.
+
+    Every line is split on tabs and the second column is split on single
+    spaces into words. Each new word gets the next free id (0, 1, 2, ...).
+    Lines that cannot be decoded or have no second column are reported on
+    stderr and skipped.
+    """
+    word_dict = dict()
+    # Bytes in, text out: bytes.decode() works on both Python 2 and 3.
+    with open(fname, "rb") as fin:
+        for line in fin:
+            try:
+                words = line.strip(b"\r\n").decode("gb18030").split("\t")[1].split(" ")
+            except (UnicodeDecodeError, IndexError):
+                # Only the failures this line can produce are swallowed; a
+                # bare except would also hide KeyboardInterrupt and real bugs.
+                sys.stderr.write("[warning] build_dict: decode error\n")
+                continue
+            for w in words:
+                if w not in word_dict:
+                    word_dict[w] = len(word_dict)
+    return word_dict
+
+
+def scdb_word_dict(vocab="scdb_data/train_set/train.vocab", train_file=None):
+    """
+    Return the word dict, loading it from `vocab` or building it on demand.
+
+    Args:
+        vocab: path of the cached vocabulary file.
+        train_file: training file passed to build_dict when `vocab` does
+            not exist yet; the freshly built dict is then saved to `vocab`.
+
+    Returns:
+        The word dict with "<unk>" set to the dict size before insertion.
+
+    Raises:
+        ValueError: `vocab` does not exist and no `train_file` was given.
+    """
+    if os.path.exists(vocab):
+        w_dict = load_dict(vocab)
+    else:
+        # `train_file` used to be an undefined global, so this branch could
+        # only fail with a NameError; it is now an explicit argument.
+        if train_file is None:
+            raise ValueError(
+                "vocab file %r does not exist and no train_file was given "
+                "to build it from" % vocab)
+        w_dict = build_dict(train_file)
+        save_dict(w_dict, vocab)
+    w_dict["<unk>"] = len(w_dict)
+    return w_dict
+
+
+def data_reader(fname, word_dict, is_dir=False):
+    """
+    Load labelled samples from GB18030 text and return a reader creator.
+
+    Each line is "<label>\t<space separated words>". Words are mapped
+    through `word_dict`; unknown words get the id len(word_dict). Lines that
+    cannot be decoded are reported on stderr and skipped. All samples are
+    loaded into memory and shuffled once, at creation time.
+
+    Args:
+        fname: a file path, or a directory path when `is_dir` is true.
+        word_dict: mapping from word to id.
+        is_dir: read every entry of directory `fname` instead of one file.
+
+    Returns:
+        A zero-argument function that yields (word_ids, label) tuples.
+    """
+    unk_id = len(word_dict)
+    if is_dir:
+        filelist = [os.path.join(fname, f) for f in os.listdir(fname)]
+    else:
+        filelist = [fname]
+
+    all_data = []
+    for each_name in filelist:
+        # Bytes in, text out: bytes.decode() works on both Python 2 and 3.
+        with open(each_name, "rb") as fin:
+            for line in fin:
+                try:
+                    cols = line.strip(b"\r\n").decode("gb18030").split("\t")
+                except UnicodeDecodeError:
+                    # Only decode failures are skipped; a bare except would
+                    # also hide KeyboardInterrupt and programming errors.
+                    sys.stderr.write("warning: ignore decode error\n")
+                    continue
+
+                label = int(cols[0])
+                wids = [word_dict.get(x, unk_id) for x in cols[1].split(" ")]
+                all_data.append((wids, label))
+
+    random.shuffle(all_data)
+
+    def reader():
+        for doc, label in all_data:
+            yield doc, label
+
+    return reader
+
+
+def scdb_train_data(train_dir="scdb_data/train_set/corpus.train.seg", w_dict=None):
+    """
+    Build the training reader: `train_dir` is read as a directory of files.
+    """
+    reader = data_reader(train_dir, w_dict, is_dir=True)
+    return reader
+
+
+def scdb_test_data(test_file, w_dict):
+    """
+    Build a test reader from the single file `test_file`.
+
+    Test set names listed by the original author:
+    test_set=["car", "lbs", "spot", "weibo",
+            "baby", "toutiao", "3c", "movie", "haogan"]
+    """
+    reader = data_reader(test_file, w_dict)
+    return reader
+
+
+def bow_net(data, label,
+            dict_dim,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2):
+    """
+    Bag-of-words classifier.
+
+    embedding -> sum sequence pooling -> tanh -> fc(hid_dim, tanh)
+    -> fc(hid_dim2, tanh) -> fc(class_dim, softmax).
+
+    Returns (avg_cost, acc, prediction).
+    """
+    embedded = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
+    pooled = fluid.layers.sequence_pool(input=embedded, pool_type='sum')
+    pooled_act = fluid.layers.tanh(pooled)
+    hidden1 = fluid.layers.fc(input=pooled_act, size=hid_dim, act="tanh")
+    hidden2 = fluid.layers.fc(input=hidden1, size=hid_dim2, act="tanh")
+    prediction = fluid.layers.fc(
+        input=[hidden2], size=class_dim, act="softmax")
+
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=loss)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, acc, prediction
+
+
+def cnn_net(data, label,
+            dict_dim,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2,
+            win_size=3):
+    """
+    Convolutional classifier.
+
+    embedding -> sequence_conv_pool(hid_dim filters, window win_size, tanh,
+    max pooling) -> fc(hid_dim2) -> fc(class_dim, softmax).
+
+    Returns (avg_cost, acc, prediction).
+    """
+    embedded = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
+    conv_pooled = fluid.nets.sequence_conv_pool(
+        input=embedded,
+        num_filters=hid_dim,
+        filter_size=win_size,
+        act="tanh",
+        pool_type="max")
+    hidden = fluid.layers.fc(input=[conv_pooled], size=hid_dim2)
+    prediction = fluid.layers.fc(
+        input=[hidden], size=class_dim, act="softmax")
+
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=loss)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, acc, prediction
+
+
+def lstm_net(data, label,
+            dict_dim,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2,
+            emb_lr=30.0):
+    """
+    Single-direction LSTM classifier.
+
+    embedding (learning rate emb_lr) -> fc(hid_dim * 4, tanh)
+    -> dynamic_lstm(size=hid_dim * 4) -> max sequence pooling -> tanh
+    -> fc(hid_dim2, tanh) -> fc(class_dim, softmax).
+
+    Returns (avg_cost, acc, prediction).
+    """
+    embedded = fluid.layers.embedding(
+        input=data,
+        size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
+    lstm_in = fluid.layers.fc(input=embedded, size=hid_dim * 4, act='tanh')
+    # dynamic_lstm returns (hidden, cell); only the hidden sequence is used.
+    hidden_seq, _ = fluid.layers.dynamic_lstm(
+        input=lstm_in, size=hid_dim * 4, is_reverse=False)
+    pooled = fluid.layers.sequence_pool(input=hidden_seq, pool_type='max')
+    pooled_act = fluid.layers.tanh(pooled)
+    hidden = fluid.layers.fc(input=pooled_act, size=hid_dim2, act='tanh')
+    prediction = fluid.layers.fc(input=hidden, size=class_dim, act='softmax')
+
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=loss)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, acc, prediction
+
+
+def bilstm_net(data, label,
+            dict_dim,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2,
+            emb_lr=30.0):
+    """
+    Bidirectional LSTM classifier.
+
+    One shared embedding (learning rate emb_lr) feeds two fc(hid_dim * 4,
+    tanh) layers, each followed by a dynamic_lstm (one forward, one with
+    is_reverse=True). The last step of each LSTM output is concatenated on
+    axis 1, then fc(hid_dim2, tanh) -> fc(class_dim, softmax).
+
+    Returns (avg_cost, acc, prediction).
+    """
+    embedded = fluid.layers.embedding(
+        input=data,
+        size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
+
+    fwd_in = fluid.layers.fc(input=embedded, size=hid_dim * 4, act='tanh')
+    bwd_in = fluid.layers.fc(input=embedded, size=hid_dim * 4, act='tanh')
+
+    # dynamic_lstm returns (hidden, cell); only the hidden sequences are used.
+    fwd_seq, _ = fluid.layers.dynamic_lstm(
+        input=fwd_in, size=hid_dim * 4, is_reverse=False)
+    bwd_seq, _ = fluid.layers.dynamic_lstm(
+        input=bwd_in, size=hid_dim * 4, is_reverse=True)
+
+    fwd_last = fluid.layers.sequence_last_step(input=fwd_seq)
+    bwd_last = fluid.layers.sequence_last_step(input=bwd_seq)
+
+    # NOTE(review): these tanh outputs are created but never consumed; the
+    # concat below takes the un-activated last steps. Confirm whether the
+    # activated tensors were meant to be concatenated instead.
+    fwd_last_act = fluid.layers.tanh(fwd_last)
+    bwd_last_act = fluid.layers.tanh(bwd_last)
+
+    merged = fluid.layers.concat(input=[fwd_last, bwd_last], axis=1)
+    hidden = fluid.layers.fc(input=merged, size=hid_dim2, act='tanh')
+    prediction = fluid.layers.fc(input=hidden, size=class_dim, act='softmax')
+
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=loss)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, acc, prediction
+
+
+def gru_net(data, label,
+            dict_dim,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2,
+            emb_lr=30.0):
+    """
+    Single-direction GRU classifier.
+
+    embedding (learning rate emb_lr) -> fc(hid_dim * 3, no activation)
+    -> dynamic_gru(size=hid_dim) -> max sequence pooling -> tanh
+    -> fc(hid_dim2, tanh) -> fc(class_dim, softmax).
+
+    Returns (avg_cost, acc, prediction).
+    """
+    embedded = fluid.layers.embedding(
+        input=data,
+        size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
+    gru_in = fluid.layers.fc(input=embedded, size=hid_dim * 3)
+    hidden_seq = fluid.layers.dynamic_gru(
+        input=gru_in, size=hid_dim, is_reverse=False)
+    pooled = fluid.layers.sequence_pool(input=hidden_seq, pool_type='max')
+    pooled_act = fluid.layers.tanh(pooled)
+    hidden = fluid.layers.fc(input=pooled_act, size=hid_dim2, act='tanh')
+    prediction = fluid.layers.fc(input=hidden, size=class_dim, act='softmax')
+
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=loss)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, acc, prediction
+
+
+def infer(test_reader,
+          use_cuda,
+          model_path=None):
+    """
+    Evaluate a saved inference model on one or more test readers.
+
+    Args:
+        test_reader: iterable of batch readers; each is called without
+            arguments and yields batches of (word_ids, label) samples.
+        use_cuda: run on CUDAPlace(0) when true, otherwise on CPUPlace.
+        model_path: directory to load with fluid.io.load_inference_model.
+            When None, a message is printed and the function returns.
+
+    Two accuracies are computed per reader from column 1 of the first
+    fetched output (the "positive" score):
+      * class2: label 2 is a hit above 0.5, label 0 below 0.5; samples with
+        label 1 are excluded from the denominator.
+      * class3: label 0 needs score <= 0.45, label 1 needs
+        0.45 < score <= 0.55, label 2 needs score > 0.55.
+    The per-reader accuracies are averaged and printed; nothing is returned.
+    An empty denominator counts as accuracy 0.0 instead of raising.
+    """
+    if model_path is None:
+        print(str(model_path) + " cannot be found")
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    inference_scope = fluid.core.Scope()
+    with fluid.scope_guard(inference_scope):
+        [inference_program, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(model_path, exe)
+
+        class2_list, class3_list = [], []
+        for each_test_reader in test_reader:
+            class2_hits, class3_hits = 0.0, 0.0
+            total_count, neu_count = 0, 0
+
+            for data in each_test_reader():
+                pred = exe.run(inference_program,
+                               feed=data2pred(data, place),
+                               fetch_list=fetch_targets,
+                               return_numpy=True)
+
+                for i, val in enumerate(data):
+                    pos_score = pred[0][i, 1]
+                    true_label = val[1]
+                    if true_label == 2.0:
+                        if pos_score > 0.5:
+                            class2_hits += 1
+                        if pos_score > 0.55:
+                            class3_hits += 1
+                    elif true_label == 0.0:
+                        if pos_score < 0.5:
+                            class2_hits += 1
+                        if pos_score <= 0.45:
+                            class3_hits += 1
+                    elif true_label == 1.0:
+                        neu_count += 1
+                        if 0.45 < pos_score <= 0.55:
+                            class3_hits += 1
+
+                total_count += len(data)
+
+            # Guard both denominators: a reader may be empty or may contain
+            # only neutral (label 1) samples.
+            polar_count = total_count - neu_count
+            class2_list.append(class2_hits / polar_count if polar_count else 0.0)
+            class3_list.append(class3_hits / total_count if total_count else 0.0)
+
+        reader_count = len(class2_list)
+        class2_acc = sum(class2_list) / reader_count if reader_count else 0.0
+        class3_acc = sum(class3_list) / reader_count if reader_count else 0.0
+        print("[test info] model_path: %s, class2_acc: %f, class3_acc: %f" % (model_path, class2_acc, class3_acc))
+
+
+def start_train(train_reader,
+        test_reader,
+        word_dict,
+        network,
+        use_cuda,
+        parallel,
+        save_dirname,
+        lr=0.2,
+        batch_size=128,
+        pass_num=30):
+    """
+    Train `network` on a single device and evaluate after each pass.
+
+    Args:
+        train_reader: batch reader creator yielding lists of
+            (word_ids, label) samples.
+        test_reader: list of batch reader creators, forwarded to infer().
+        word_dict: vocabulary; the network is built with
+            dict_dim = len(word_dict) + 1.
+        network: callable(data, label, dict_dim) returning
+            (avg_cost, acc, prediction).
+        use_cuda: train on CUDAPlace(0) when true, otherwise on CPUPlace.
+        parallel: not used by this function.
+        save_dirname: each pass is saved to "<save_dirname>/epoch<pass_id>".
+        lr: Adagrad learning rate.
+        batch_size: not used by this function (batching is done by the reader).
+        pass_num: number of passes over `train_reader`.
+    """
+    data = fluid.layers.data(
+        name="words",
+        shape=[1],
+        dtype="int64",
+        lod_level=1)
+
+    label = fluid.layers.data(
+        name="label",
+        shape=[1],
+        dtype="int64")
+
+    cost, acc, pred = network(
+        data, label, len(word_dict) + 1)
+
+    optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
+    optimizer.minimize(cost)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+
+    exe.run(fluid.default_startup_program())
+    for pass_id in range(pass_num):
+        data_count, total_acc, total_cost = 0, 0.0, 0.0
+        # `batch`, not `data`: do not shadow the input layer variable.
+        for batch in train_reader():
+            avg_cost_np, avg_acc_np = exe.run(fluid.default_main_program(),
+                                              feed=feeder.feed(batch),
+                                              fetch_list=[cost, acc])
+            # Weight the per-batch means by the batch size.
+            data_size = len(batch)
+            total_acc += data_size * avg_acc_np
+            total_cost += data_size * avg_cost_np
+            data_count += data_size
+
+        # An empty reader reports 0.0 instead of raising ZeroDivisionError.
+        avg_cost = total_cost / data_count if data_count else 0.0
+        avg_acc = total_acc / data_count if data_count else 0.0
+        print("[train info]: pass_id: %d, avg_acc: %f, avg_cost: %f" % (pass_id, avg_acc, avg_cost))
+
+        epoch_model = save_dirname + "/" + "epoch" + str(pass_id)
+        fluid.io.save_inference_model(
+            epoch_model,
+            ["words"],
+            pred, exe)
+        infer(test_reader, False, epoch_model)
+
+
+def train_net(vocab="./thirdparty/train.vocab",
+            train_dir="./train",
+            test_list=("car", "spot", "weibo", "lbs")):
+    """
+    Train the bag-of-words classifier on the SCDB data on a single device.
+
+    Args:
+        vocab: vocabulary file passed to scdb_word_dict.
+        train_dir: directory of training files passed to scdb_train_data.
+        test_list: names of the test files located under "./thirdparty";
+            one test reader is built per name.
+    """
+    w_dict = scdb_word_dict(vocab=vocab)
+    test_files = [os.path.join("./thirdparty", f) for f in test_list]
+
+    train_reader = paddle.batch(
+        scdb_train_data(train_dir, w_dict),
+        batch_size=256)
+
+    test_reader = [
+        paddle.batch(scdb_test_data(test_file, w_dict), batch_size=50)
+        for test_file in test_files
+    ]
+
+    start_train(train_reader, test_reader, w_dict, bow_net, use_cuda=False,
+                parallel=False, save_dirname="scdb_bow_model", lr=0.002,
+                pass_num=10, batch_size=256)
+
+
+# Script entry point: run the training pipeline with its default arguments.
+if __name__ == "__main__":
+    train_net()