From c423806524324663aa67a1016af10c0311e9b2e7 Mon Sep 17 00:00:00 2001
From: peterzhang2029
Date: Tue, 30 Jan 2018 18:44:16 +0800
Subject: [PATCH] add text classification

---
 fluid/text_classification/README.md |  12 +++
 fluid/text_classification/config.py |  16 +++
 fluid/text_classification/train.py  | 158 ++++++++++++++++++++++++++++
 3 files changed, 186 insertions(+)
 create mode 100644 fluid/text_classification/README.md
 create mode 100644 fluid/text_classification/config.py
 create mode 100644 fluid/text_classification/train.py

diff --git a/fluid/text_classification/README.md b/fluid/text_classification/README.md
new file mode 100644
index 00000000..40df3211
--- /dev/null
+++ b/fluid/text_classification/README.md
@@ -0,0 +1,12 @@
+# Text Classification
+
+## Data Preparation
+```
+wget http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
+tar zxf aclImdb_v1.tar.gz
+```
+
+## Training
+```
+python train.py --dict_path 'aclImdb/imdb.vocab'
+```
diff --git a/fluid/text_classification/config.py b/fluid/text_classification/config.py
new file mode 100644
index 00000000..2aba3247
--- /dev/null
+++ b/fluid/text_classification/config.py
@@ -0,0 +1,16 @@
+class TrainConfig(object):
+    """Training settings; train.py reads them as class attributes."""
+    # Whether to use GPU in training or not.
+    use_gpu = False
+
+    # The training batch size.
+    batch_size = 4
+
+    # The number of passes (epochs) over the training set.
+    num_passes = 30
+
+    # The global learning rate.
+    learning_rate = 0.01
+
+    # Training log will be printed every log_period batches.
+    log_period = 100
diff --git a/fluid/text_classification/train.py b/fluid/text_classification/train.py
new file mode 100644
index 00000000..5980f71c
--- /dev/null
+++ b/fluid/text_classification/train.py
@@ -0,0 +1,158 @@
+import numpy as np
+import sys
+import os
+import argparse
+import time
+
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+from config import TrainConfig as conf
+
+
+def parse_args():
+    """Parse the command line; --dict_path is the only (required) option."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--dict_path',
+        type=str,
+        required=True,
+        help="Path of the word dictionary.")
+    return parser.parse_args()
+
+
+# Define to_lodtensor function to process the sequential data.
+def to_lodtensor(data, place):
+    """Pack a batch of id sequences into one LoDTensor of shape [total, 1]."""
+    # Materialize: `data` is walked twice, which would exhaust an iterator.
+    data = list(data)
+    lod = [0]
+    for seq in data:
+        lod.append(lod[-1] + len(seq))
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+# Load the dictionary.
+def load_vocab(filename):
+    """Read a vocabulary file with one word per line.
+
+    Returns a dict mapping each stripped line to its 0-based line number.
+    """
+    with open(filename) as f:
+        return {line.strip(): wid for wid, line in enumerate(f)}
+
+
+# Define the convolution model.
+def conv_net(dict_dim,
+             window_size=3,
+             emb_dim=128,
+             num_filters=128,
+             fc0_dim=96,
+             class_dim=2):
+    """Build the convolutional text classifier.
+
+    Layers: embedding -> sequence_conv_pool (tanh, max) -> fc -> softmax fc,
+    with a mean cross-entropy cost against the "label" input.
+    Returns the tuple (data, label, prediction, avg_cost).
+    """
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+    emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
+    conv_3 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=num_filters,
+        filter_size=window_size,
+        act="tanh",
+        pool_type="max")
+    fc_0 = fluid.layers.fc(input=[conv_3], size=fc0_dim)
+    prediction = fluid.layers.fc(input=[fc_0], size=class_dim, act="softmax")
+
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    return data, label, prediction, avg_cost
+
+
+def main(dict_path):
+    """Train on IMDB for conf.num_passes passes, testing after each pass."""
+    word_dict = load_vocab(dict_path)
+    # The imdb readers map out-of-vocabulary words to word_dict["<unk>"].
+    word_dict["<unk>"] = len(word_dict)
+    dict_dim = len(word_dict)
+    print("The dictionary size is : %d" % dict_dim)
+
+    data, label, prediction, avg_cost = conv_net(dict_dim)
+
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=conf.learning_rate)
+    sgd_optimizer.minimize(avg_cost)
+
+    accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
+
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        test_target = accuracy.metrics + accuracy.states
+        inference_program = fluid.io.get_inference_program(test_target)
+
+    # The training data set.
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=5000),
+        batch_size=conf.batch_size)
+
+    # The testing data set.
+    test_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.test(word_dict), buf_size=5000),
+        batch_size=conf.batch_size)
+
+    if conf.use_gpu:
+        place = fluid.CUDAPlace(0)
+    else:
+        place = fluid.CPUPlace()
+
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+    exe.run(fluid.default_startup_program())
+
+    def test(exe):
+        accuracy.reset(exe)
+        for batch in test_reader():
+            # Lists, not map(): map() is a one-shot iterator on Python 3.
+            input_seq = to_lodtensor([x[0] for x in batch], place)
+            y_data = np.array([x[1] for x in batch]).astype("int64")
+            y_data = y_data.reshape([-1, 1])
+            exe.run(inference_program,
+                    feed={"words": input_seq,
+                          "label": y_data})
+        return accuracy.eval(exe)
+
+    total_time = 0.
+    for pass_id in range(conf.num_passes):
+        accuracy.reset(exe)
+        start_time = time.time()
+        for batch_id, batch in enumerate(train_reader()):
+            cost_val, acc_val = exe.run(
+                fluid.default_main_program(),
+                feed=feeder.feed(batch),
+                fetch_list=[avg_cost, accuracy.metrics[0]])
+            pass_acc = accuracy.eval(exe)
+            if batch_id and batch_id % conf.log_period == 0:
+                print("Pass id: %d, batch id: %d, cost: %f, pass_acc %f" %
+                      (pass_id, batch_id, cost_val, pass_acc))
+        end_time = time.time()
+        total_time += (end_time - start_time)
+        pass_test_acc = test(exe)
+        print("Pass id: %d, test_acc: %f" % (pass_id, pass_test_acc))
+    print("Total train time: %f" % (total_time))
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    main(args.dict_path)
-- 
GitLab