From c423806524324663aa67a1016af10c0311e9b2e7 Mon Sep 17 00:00:00 2001
From: peterzhang2029 <zhangchao41@baidu.com>
Date: Tue, 30 Jan 2018 18:44:16 +0800
Subject: [PATCH] add text classification

---
 fluid/text_classification/README.md |  12 +++
 fluid/text_classification/config.py |  16 +++
 fluid/text_classification/train.py  | 158 ++++++++++++++++++++++++++++
 3 files changed, 186 insertions(+)
 create mode 100644 fluid/text_classification/README.md
 create mode 100644 fluid/text_classification/config.py
 create mode 100644 fluid/text_classification/train.py
diff --git a/fluid/text_classification/README.md b/fluid/text_classification/README.md
new file mode 100644
index 00000000..40df3211
--- /dev/null
+++ b/fluid/text_classification/README.md
@@ -0,0 +1,12 @@
+# Text Classification
+
+## Data Preparation
+```
+wget http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
+tar zxf aclImdb_v1.tar.gz
+```
+
+## Training
+```
+python train.py --dict_path 'aclImdb/imdb.vocab'
+```
diff --git a/fluid/text_classification/config.py b/fluid/text_classification/config.py
new file mode 100644
index 00000000..2aba3247
--- /dev/null
+++ b/fluid/text_classification/config.py
@@ -0,0 +1,16 @@
+class TrainConfig(object):
+    # Training settings; train.py imports this class under the name `conf`.
+    # Run on CUDA device 0 when True, otherwise train on the CPU.
+    use_gpu = False
+
+    # Mini-batch size, applied to both the training and the testing reader.
+    batch_size = 4
+
+    # Number of passes (epochs) over the training set.
+    num_passes = 30
+
+    # Learning rate handed to the SGD optimizer.
+    learning_rate = 0.01
+
+    # Print a training log line every log_period batches (batch 0 is skipped).
+    log_period = 100
diff --git a/fluid/text_classification/train.py b/fluid/text_classification/train.py
new file mode 100644
index 00000000..5980f71c
--- /dev/null
+++ b/fluid/text_classification/train.py
@@ -0,0 +1,158 @@
+import numpy as np
+import sys
+import os
+import argparse
+import time
+
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+from config import TrainConfig as conf
+
+
+def parse_args():
+    """Parse the command line; the ``--dict_path`` option is mandatory."""
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument(
+        '--dict_path', type=str, required=True,
+        help="Path of the word dictionary.")
+    options = arg_parser.parse_args()
+    return options
+
+
+# Pack a batch of variable-length sequences into a single LoDTensor.
+def to_lodtensor(data, place):
+    """Flatten `data` into an (N, 1) int64 LoDTensor placed on `place`."""
+    # Level-of-detail offsets: 0 followed by the running sum of the lengths.
+    offsets = [0]
+    for seq in data:
+        offsets.append(offsets[-1] + len(seq))
+    flat = np.concatenate(data, axis=0).astype("int64")
+    # Store the values as a single column, one row per element.
+    flat = flat.reshape([len(flat), 1])
+    tensor = fluid.LoDTensor()
+    tensor.set(flat, place)
+    tensor.set_lod([offsets])
+    return tensor
+
+
+# Build the word -> id mapping from a vocabulary file (one word per line).
+def load_vocab(filename):
+    """Return a dict mapping each stripped line to its 0-based line index."""
+    word_to_id = {}
+    with open(filename) as vocab_file:
+        for index, raw_line in enumerate(vocab_file):
+            # A repeated word keeps the index of its last occurrence.
+            word_to_id[raw_line.strip()] = index
+    return word_to_id
+
+
+# Build the convolutional text classifier with the fluid layer API.
+def conv_net(dict_dim,
+             window_size=3,
+             emb_dim=128,
+             num_filters=128,
+             fc0_dim=96,
+             class_dim=2):
+    """Return the `(words, label, prediction, avg_cost)` variables of the net.
+
+    Pipeline: embedding -> sequence conv (tanh) + max pool -> fc -> softmax fc,
+    with the cross-entropy against `label` reduced by `mean` as the cost.
+    """
+    # Inputs: a lod_level=1 sequence of int64 word ids and an int64 label.
+    words = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    target = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+    embedded = fluid.layers.embedding(input=words, size=[dict_dim, emb_dim])
+    pooled = fluid.nets.sequence_conv_pool(
+        input=embedded,
+        num_filters=num_filters,
+        filter_size=window_size,
+        act="tanh",
+        pool_type="max")
+    hidden = fluid.layers.fc(input=[pooled], size=fc0_dim)
+    probs = fluid.layers.fc(input=[hidden], size=class_dim, act="softmax")
+
+    # Cross-entropy of the softmax output, reduced with `mean`.
+    losses = fluid.layers.cross_entropy(input=probs, label=target)
+    mean_loss = fluid.layers.mean(x=losses)
+    return words, target, probs, mean_loss
+
+
+def main(dict_path):
+    """Train the conv net on IMDB and report test accuracy after every pass.
+
+    `dict_path` is the vocabulary file read by `load_vocab`; an extra
+    "<unk>" entry is appended before the readers are built.
+    """
+    word_dict = load_vocab(dict_path)
+    word_dict["<unk>"] = len(word_dict)
+    dict_dim = len(word_dict)
+    print("The dictionary size is : %d" % dict_dim)
+
+    words, label, prediction, avg_cost = conv_net(dict_dim)
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=conf.learning_rate)
+    sgd_optimizer.minimize(avg_cost)
+    accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
+
+    # Derive the inference program from a clone of the main program,
+    # targeting the evaluator's metrics and states.
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        test_target = accuracy.metrics + accuracy.states
+        inference_program = fluid.io.get_inference_program(test_target)
+
+    # The training data set.
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=5000),
+        batch_size=conf.batch_size)
+    # The testing data set.
+    test_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.test(word_dict), buf_size=5000),
+        batch_size=conf.batch_size)
+
+    place = fluid.CUDAPlace(0) if conf.use_gpu else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(feed_list=[words, label], place=place)
+    exe.run(fluid.default_startup_program())
+
+    def test(exe):
+        """Run one sweep over the test reader and return the accuracy."""
+        accuracy.reset(exe)
+        for batch in test_reader():
+            # Comprehensions (not `map`) so real lists are built on any Python.
+            input_seq = to_lodtensor([sample[0] for sample in batch], place)
+            y_data = np.array([sample[1] for sample in batch]).astype("int64")
+            y_data = y_data.reshape([-1, 1])
+            exe.run(inference_program,
+                    feed={"words": input_seq,
+                          "label": y_data})
+        return accuracy.eval(exe)
+
+    total_time = 0.
+    for pass_id in range(conf.num_passes):
+        accuracy.reset(exe)
+        start_time = time.time()
+        for batch_id, batch in enumerate(train_reader()):
+            # The fetched batch metric is unused; pass_acc comes from eval().
+            cost_val, _ = exe.run(
+                fluid.default_main_program(),
+                feed=feeder.feed(batch),
+                fetch_list=[avg_cost, accuracy.metrics[0]])
+            pass_acc = accuracy.eval(exe)
+            if batch_id and batch_id % conf.log_period == 0:
+                print("Pass id: %d, batch id: %d, cost: %f, pass_acc %f" %
+                      (pass_id, batch_id, cost_val, pass_acc))
+        # Only training time is accumulated; the test sweep below is not timed.
+        total_time += time.time() - start_time
+        pass_test_acc = test(exe)
+        print("Pass id: %d, test_acc: %f" % (pass_id, pass_test_acc))
+    print("Total train time: %f" % (total_time))
+
+
+# Script entry point: parse the CLI options and start training.
+if __name__ == '__main__':
+    main(parse_args().dict_path)
-- 
GitLab