Commit 83aba534 authored by Zeyu Chen

add style.yapf

Parent 1a316d55
import sys
import time
import numpy as np
import paddle.fluid as fluid
import paddle
def bow_net(data,
            label,
            dict_dim,
            emb_dim=128,
            hid_dim=128,
            hid_dim2=96,
            class_dim=2):
    """
    BOW (bag-of-words) net
    """
    # embedding layer
    emb = fluid.layers.embedding(
        input=data, size=[dict_dim, emb_dim], param_attr="bow_embedding")
    # bow layer
    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
    bow_tanh = fluid.layers.tanh(bow)
    # fully connected layers
    fc_1 = fluid.layers.fc(
        input=bow_tanh, size=hid_dim, act="tanh", name="bow_fc1")
    fc_2 = fluid.layers.fc(
        input=fc_1, size=hid_dim2, act="tanh", name="bow_fc2")
    # softmax layer
    prediction = fluid.layers.fc(
        input=[fc_2], size=class_dim, act="softmax", name="fc_softmax")
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)
    return avg_cost, acc, prediction, emb
def cnn_net(data,
            label,
            dict_dim,
            emb_dim=128,
            hid_dim=128,
            hid_dim2=96,
            class_dim=2,
            win_size=3):
    """
    Conv net
    """
    # embedding layer
    emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
    # convolution layer
    conv_3 = fluid.nets.sequence_conv_pool(
        input=emb,
        num_filters=hid_dim,
        filter_size=win_size,
        act="tanh",
        pool_type="max")
    # fully connected layer
    fc_1 = fluid.layers.fc(input=[conv_3], size=hid_dim2)
    # softmax layer
    prediction = fluid.layers.fc(input=[fc_1], size=class_dim, act="softmax")
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)
    return avg_cost, acc, prediction
def lstm_net(data,
             label,
             dict_dim,
             emb_dim=128,
             hid_dim=128,
             hid_dim2=96,
             class_dim=2,
             emb_lr=30.0):
    """
    Lstm net
    """
    # embedding layer
    emb = fluid.layers.embedding(
        input=data,
        size=[dict_dim, emb_dim],
        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
    # lstm layer
    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
    lstm_h, c = fluid.layers.dynamic_lstm(
        input=fc0, size=hid_dim * 4, is_reverse=False)
    # max pooling layer
    lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
    lstm_max_tanh = fluid.layers.tanh(lstm_max)
    # fully connected layer
    fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')
    # softmax layer
    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)
    return avg_cost, acc, prediction
def bilstm_net(data,
               label,
               dict_dim,
               emb_dim=128,
               hid_dim=128,
               hid_dim2=96,
               class_dim=2,
               emb_lr=30.0):
    """
    Bi-Lstm net
    """
    # embedding layer
    emb = fluid.layers.embedding(
        input=data,
        size=[dict_dim, emb_dim],
        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
    # bi-lstm layer
    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
    rfc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
    lstm_h, c = fluid.layers.dynamic_lstm(
        input=fc0, size=hid_dim * 4, is_reverse=False)
    rlstm_h, rc = fluid.layers.dynamic_lstm(
        input=rfc0, size=hid_dim * 4, is_reverse=True)
    # extract the last step of each direction
    lstm_last = fluid.layers.sequence_last_step(input=lstm_h)
    rlstm_last = fluid.layers.sequence_last_step(input=rlstm_h)
    lstm_last_tanh = fluid.layers.tanh(lstm_last)
    rlstm_last_tanh = fluid.layers.tanh(rlstm_last)
    # concat layer (use the tanh-activated outputs, which were otherwise
    # computed but left unused)
    lstm_concat = fluid.layers.concat(
        input=[lstm_last_tanh, rlstm_last_tanh], axis=1)
    # fully connected layer
    fc1 = fluid.layers.fc(input=lstm_concat, size=hid_dim2, act='tanh')
    # softmax layer
    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)
    return avg_cost, acc, prediction
def gru_net(data,
            label,
            dict_dim,
            emb_dim=128,
            hid_dim=128,
            hid_dim2=96,
            class_dim=2,
            emb_lr=30.0):
    """
    GRU net
    """
    # embedding layer
    emb = fluid.layers.embedding(
        input=data,
        size=[dict_dim, emb_dim],
        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
    # gru layer
    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3)
    gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False)
    # max pooling layer
    gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max')
    gru_max_tanh = fluid.layers.tanh(gru_max)
    # fully connected layer
    fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh')
    # softmax layer
    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)
    return avg_cost, acc, prediction
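Each builder above only assembles layers; the caller supplies the `words` and `label` variables. A minimal sketch of wiring up bow_net, mirroring the (commented-out) pattern in train_net below — the dictionary size here is illustrative, in training it is len(word_dict) + 2:

import paddle.fluid as fluid
from nets import bow_net

# illustrative vocabulary size (real value: len(word_dict) + 2)
dict_dim = 10000 + 2

# variable-length word-id sequences are fed as LoD tensors (lod_level=1)
data = fluid.layers.data(name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name="label", shape=[1], dtype="int64")

avg_cost, acc, prediction, emb = bow_net(data, label, dict_dim)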
# coding: utf-8
import sys
sys.path.append("../")
import time
import unittest
import contextlib
import logging
import argparse
import ast
import paddle.fluid as fluid
import paddle_hub as hub
import utils
from nets import bow_net
from nets import cnn_net
from nets import lstm_net
from nets import bilstm_net
from nets import gru_net
logger = logging.getLogger("paddle-fluid")
logger.setLevel(logging.INFO)
def parse_args():
    parser = argparse.ArgumentParser("Sentiment Classification.")
    # training data path
    parser.add_argument(
        "--train_data_path",
        type=str,
        required=False,
        help="The path of training data. Should be given in train mode!")
    # test data path
    parser.add_argument(
        "--test_data_path",
        type=str,
        required=False,
        help="The path of test data. Should be given in eval or infer mode!")
    # word_dict path
    parser.add_argument(
        "--word_dict_path",
        type=str,
        required=True,
        help="The path of the word dictionary.")
    # current mode
    parser.add_argument(
        "--mode",
        type=str,
        required=True,
        choices=['train', 'eval', 'infer'],
        help="train/eval/infer mode")
    # model type
    parser.add_argument(
        "--model_type", type=str, default="bow_net", help="type of model")
    # model save path
    parser.add_argument(
        "--model_path",
        type=str,
        default="models",
        required=True,
        help="The path to save the trained models.")
    # number of passes for the training task
    parser.add_argument(
        "--num_passes",
        type=int,
        default=10,
        help="Number of passes for the training task.")
    # batch size
    parser.add_argument(
        "--batch_size",
        type=int,
        default=256,
        help="The number of training examples in one forward/backward pass.")
    # learning rate for training
    parser.add_argument(
        "--lr", type=float, default=0.002, help="The lr value for training.")
    # whether to use gpu
    parser.add_argument(
        "--use_gpu",
        type=ast.literal_eval,
        default=False,
        help="Whether to use gpu to train the model.")
    # parallel training
    parser.add_argument(
        "--is_parallel",
        type=ast.literal_eval,
        default=False,
        help="Whether to train the model in parallel.")
    args = parser.parse_args()
    return args
def remove_feed_fetch_op(program):
    """Remove the feed and fetch ops so new layers can be appended."""
    print("remove feed fetch op")
    block = program.global_block()
    need_to_remove_op_index = []
    for i, op in enumerate(block.ops):
        if op.type == "feed" or op.type == "fetch":
            need_to_remove_op_index.append(i)
    # remove from the back so earlier indices stay valid
    for index in need_to_remove_op_index[::-1]:
        block._remove_op(index)
    block._remove_var("feed")
    block._remove_var("fetch")
    program.desc.flush()
    print("********************************")
    print(program)
    print("********************************")
def train_net(train_reader,
              word_dict,
              network,
              use_gpu,
              parallel,
              save_dirname,
              lr=0.002,
              batch_size=128,
              pass_num=30):
    """
    train network
    """
    if network == "bilstm_net":
        network = bilstm_net
    elif network == "bow_net":
        network = bow_net
    elif network == "cnn_net":
        network = cnn_net
    elif network == "lstm_net":
        network = lstm_net
    elif network == "gru_net":
        network = gru_net
    else:
        print("unknown network type")
        return
    # word seq data
    # data = fluid.layers.data(
    #     name="words", shape=[1], dtype="int64", lod_level=1)
    # if not parallel:
    #     # set network
    #     cost, acc, pred, emb = network(data, label, len(word_dict) + 2)
    # else:
    #     places = fluid.layers.get_places(device_count=2)
    #     pd = fluid.layers.ParallelDo(places)
    #     with pd.do():
    #         # set network
    #         cost, acc, prediction, emb = network(
    #             pd.read_input(data), pd.read_input(label),
    #             len(word_dict) + 2)
    #         pd.write_output(cost)
    #         pd.write_output(acc)
    #     cost, acc = pd()
    #     cost = fluid.layers.mean(cost)
    #     acc = fluid.layers.mean(acc)
    dict_dim = len(word_dict) + 2
    emb_dim = 128
    hid_dim = 128
    hid_dim2 = 96
    class_dim = 2
    module_link = "https://paddlehub.cdn.bcebos.com/senta/bow_module_3.tar.gz"
    module = hub.Module(module_link)
    main_program = fluid.Program()
    startup_program = fluid.Program()
    # replace the default main program with the module's inference program,
    # then strip its feed/fetch ops so new layers can be appended
    fluid.framework.switch_main_program(module.get_inference_program())
    remove_feed_fetch_op(fluid.default_main_program())
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    data = fluid.default_main_program().global_block().var("words")
    emb = module.get_module_output()
    # embedding layer (now provided by the module output above)
    # emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
    # # input=data, size=[dict_dim, emb_dim], param_attr="bow_embedding")
    # bow layer
    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
    bow_tanh = fluid.layers.tanh(bow)
    # fully connected layers
    fc_1 = fluid.layers.fc(
        input=bow_tanh, size=hid_dim, act="tanh", name="bow_fc1")
    fc_2 = fluid.layers.fc(
        input=fc_1, size=hid_dim2, act="tanh", name="bow_fc2")
    # softmax layer
    pred = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
    # print(fluid.default_main_program())
    cost = fluid.layers.mean(
        fluid.layers.cross_entropy(input=pred, label=label))
    acc = fluid.layers.accuracy(input=pred, label=label)
    # Original Senta BoW network
    # label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    # data = fluid.layers.data(
    #     name="words", shape=[1], dtype="int64", lod_level=1)
    # cost, acc, pred, emb = network(data, label, len(word_dict) + 2)
    # print("new program")
    # with open("program_senta.prototxt", "w") as fo:
    #     fo.write(str(fluid.default_main_program()))
    # print("program_senta", fluid.default_main_program())
    with open("senta_load_module.prototxt", "w") as fo:
        fo.write(str(fluid.default_main_program()))
    print("senta_load_module", fluid.default_main_program())
    # set optimizer
    optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
    optimizer.minimize(cost)
    # set place, executor, datafeeder
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    feeder = fluid.DataFeeder(feed_list=["words", "label"], place=place)
    exe.run(fluid.default_startup_program())
    # start training...
    for pass_id in range(pass_num):
        data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
        for batch in train_reader():
            avg_cost_np, avg_acc_np = exe.run(
                fluid.default_main_program(),
                feed=feeder.feed(batch),
                fetch_list=[cost, acc],
                return_numpy=True)
            data_size = len(batch)
            total_acc += data_size * avg_acc_np
            total_cost += data_size * avg_cost_np
            data_count += data_size
        avg_cost = total_cost / data_count
        avg_acc = total_acc / data_count
        print("[train info]: pass_id: %d, avg_acc: %f, avg_cost: %f" %
              (pass_id, avg_acc, avg_cost))
    # compare program desc here when we change the graph
    with open("senta_forward_backward_module.prototxt", "w") as fo:
        fo.write(str(fluid.default_main_program()))
    # print("senta_load_module", fluid.default_main_program())
    # save the model
    bow_module_path = save_dirname + "/" + "bow_module"
    fluid.io.save_inference_model(bow_module_path, ["words"], emb, exe)
def eval_net(test_reader, use_gpu, model_path=None):
    """
    Evaluation function
    """
    if model_path is None:
        print(str(model_path) + " cannot be found")
        return
    # set place, executor
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    inference_scope = fluid.core.Scope()
    with fluid.scope_guard(inference_scope):
        # load the saved model
        [inference_program, feed_target_names,
         fetch_targets] = fluid.io.load_inference_model(model_path, exe)
        # compute 2-class and 3-class accuracy
        class2_acc, class3_acc = 0.0, 0.0
        total_count, neu_count = 0, 0
        for data in test_reader():
            # infer a batch
            pred = exe.run(
                inference_program,
                feed=utils.data2tensor(data, place),
                fetch_list=fetch_targets,
                return_numpy=True)
            for i, val in enumerate(data):
                class3_label, class2_label = utils.get_predict_label(
                    pred[0][i, 1])
                true_label = val[1]
                if class2_label == true_label:
                    class2_acc += 1
                if class3_label == true_label:
                    class3_acc += 1
                if true_label == 1.0:
                    neu_count += 1
            total_count += len(data)
        # neutral samples are excluded from the 2-class accuracy
        class2_acc = class2_acc / (total_count - neu_count)
        class3_acc = class3_acc / total_count
        print("[test info] model_path: %s, class2_acc: %f, class3_acc: %f" %
              (model_path, class2_acc, class3_acc))
def infer_net(test_reader, use_gpu, model_path=None):
    """
    Inference function
    """
    if model_path is None:
        print(str(model_path) + " cannot be found")
        return
    # set place, executor
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    inference_scope = fluid.core.Scope()
    with fluid.scope_guard(inference_scope):
        # load the saved model
        [inference_program, feed_target_names,
         fetch_targets] = fluid.io.load_inference_model(model_path, exe)
        for data in test_reader():
            # infer a batch
            pred = exe.run(
                inference_program,
                feed=utils.data2tensor(data, place),
                fetch_list=fetch_targets,
                return_numpy=True)
            for i, val in enumerate(data):
                class3_label, class2_label = utils.get_predict_label(
                    pred[0][i, 1])
                pos_prob = pred[0][i, 1]
                neg_prob = 1 - pos_prob
                print("predict label: %d, pos_prob: %f, neg_prob: %f" %
                      (class3_label, pos_prob, neg_prob))
def main(args):
    # train mode
    if args.mode == "train":
        # prepare_data to get word_dict, train_reader
        word_dict, train_reader = utils.prepare_data(
            args.train_data_path, args.word_dict_path, args.batch_size,
            args.mode)
        train_net(train_reader, word_dict, args.model_type, args.use_gpu,
                  args.is_parallel, args.model_path, args.lr, args.batch_size,
                  args.num_passes)
    # eval mode
    elif args.mode == "eval":
        # prepare_data to get word_dict, test_reader
        word_dict, test_reader = utils.prepare_data(
            args.test_data_path, args.word_dict_path, args.batch_size,
            args.mode)
        eval_net(test_reader, args.use_gpu, args.model_path)
    # infer mode
    elif args.mode == "infer":
        # prepare_data to get word_dict, test_reader
        word_dict, test_reader = utils.prepare_data(
            args.test_data_path, args.word_dict_path, args.batch_size,
            args.mode)
        infer_net(test_reader, args.use_gpu, args.model_path)


if __name__ == "__main__":
    args = parse_args()
    main(args)
python sentiment_classify.py --train_data_path ./data/train_data/corpus.train --word_dict_path ./data/train.vocab --mode train --model_path ./models
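The eval and infer modes follow the same flag layout; the test data path below is illustrative:

python sentiment_classify.py --test_data_path ./data/test_data/corpus.test --word_dict_path ./data/train.vocab --mode eval --model_path ./models
python sentiment_classify.py --test_data_path ./data/test_data/corpus.test --word_dict_path ./data/train.vocab --mode infer --model_path ./models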
import os
import sys
import time
import numpy as np
import random
import paddle.fluid as fluid
import paddle
def get_predict_label(pos_prob):
    neg_prob = 1 - pos_prob
    # the neutral threshold should lie in (0.5, 1)
    neu_threshold = 0.55
    if neg_prob > neu_threshold:
        class3_label = 0  # negative
    elif pos_prob > neu_threshold:
        class3_label = 2  # positive
    else:
        class3_label = 1  # neutral
    if pos_prob >= neg_prob:
        class2_label = 2
    else:
        class2_label = 0
    return class3_label, class2_label
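A quick sanity check of this mapping (not part of the original file) — with the 0.55 threshold, probabilities near 0.5 fall into the neutral band of the 3-class label while the 2-class label is a simple tie-break on pos_prob >= neg_prob:

# confidently positive, confidently negative, and borderline inputs
assert get_predict_label(0.70) == (2, 2)
assert get_predict_label(0.30) == (0, 0)
assert get_predict_label(0.50) == (1, 2)  # neutral in 3-class only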
def to_lodtensor(data, place):
    """
    convert a batch of sequences to a LoDTensor
    """
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res
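The LoD records cumulative sequence offsets. A minimal sketch, reusing the fluid import above, for two sequences of lengths 3 and 2:

# two word-id sequences of lengths 3 and 2
batch = [[1, 2, 3], [4, 5]]
tensor = to_lodtensor(batch, fluid.CPUPlace())
# the flattened data has shape [5, 1]; the LoD is [[0, 3, 5]],
# i.e. sequence i spans rows lod[i]:lod[i + 1]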
def data2tensor(data, place):
    """
    data2tensor
    """
    # wrap map() in list() so this also works under Python 3,
    # where map returns an iterator
    input_seq = to_lodtensor(list(map(lambda x: x[0], data)), place)
    return {"words": input_seq}
def data_reader(file_path, word_dict, is_shuffle=True):
    """
    Convert word sequences into word-id sequences
    """
    unk_id = len(word_dict)
    all_data = []
    with open(file_path, "r") as fin:
        for line in fin:
            cols = line.strip().split("\t")
            label = int(cols[0])
            wids = [
                word_dict[x] if x in word_dict else unk_id
                for x in cols[1].split(" ")
            ]
            all_data.append((wids, label))
    if is_shuffle:
        random.shuffle(all_data)

    def reader():
        for doc, label in all_data:
            yield doc, label

    return reader
def load_vocab(file_path):
    """
    load the given vocabulary
    """
    vocab = {}
    with open(file_path) as f:
        wid = 0
        for line in f:
            vocab[line.strip()] = wid
            wid += 1
    vocab["<unk>"] = len(vocab)
    return vocab
def prepare_data(data_path, word_dict_path, batch_size, mode):
    """
    prepare data
    """
    assert os.path.exists(
        word_dict_path), "The given word dictionary does not exist."
    if mode == "train":
        assert os.path.exists(
            data_path), "The given training data does not exist."
    if mode == "eval" or mode == "infer":
        assert os.path.exists(data_path), "The given test data does not exist."
    word_dict = load_vocab(word_dict_path)
    if mode == "train":
        train_reader = paddle.batch(
            data_reader(data_path, word_dict, True), batch_size)
        return word_dict, train_reader
    else:
        test_reader = paddle.batch(
            data_reader(data_path, word_dict, False), batch_size)
        return word_dict, test_reader
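Putting the helpers together — a minimal sketch of building a batched reader and pulling one batch, matching the call in main() above (paths are illustrative):

word_dict, train_reader = prepare_data(
    "./data/train_data/corpus.train", "./data/train.vocab", 256, "train")
for batch in train_reader():
    # each element is a (word_ids, label) pair
    print(len(batch), batch[0])
    break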
@@ -51,7 +51,16 @@ class Module(object):
        # load assets
        # self._load_assets(module_dir)

    # TODO(ZeyuChen): need to register more signatures to execute
    # different implementations
    def __call__(self, inputs=None, signature=None):
        """ Call the default signature and return results
        """
        # TODO(ZeyuChen): add proto spec to check which task we need to run
        # if it's an NLP word embedding task, then do word preprocessing
        # if it's an image classification or image feature task, do the other work
        # if it's
        word_ids_lod_tensor = self._process_input(inputs)
        np_words_id = np.array(word_ids_lod_tensor)
        print("word_ids_lod_tensor\n", np_words_id)
@@ -160,7 +169,7 @@ class ModuleDesc(object):
        os.makedirs(path)

    @staticmethod
-    def save_dict(path, word_dict):
+    def save_dict(path, word_dict, dict_name="dict.txt"):
        ModuleDesc._mkdir(path)
        with open(os.path.join(path, "tokens.txt"), "w") as fo:
            print("tokens.txt path", os.path.join(path, "tokens.txt"))