Commit 3b3d7206 authored by frankwhzhang, committed by Yi Liu

tagspace (#1406)

Parent bc83661a
# TagSpace
The following is a brief overview of this example's directory layout:
```text
.
├── README.md          # documentation
├── train.py           # training script
├── utils.py           # common utility functions
├── small_train.txt    # small sample of the training set
└── small_test.txt     # small sample of the test set
```
## Introduction
The TagSpace model is described in the paper [#TagSpace: Semantic Embeddings from Hashtags](https://research.fb.com/publications/tagspace-semantic-embeddings-from-hashtags/). This example implements the TagSpace model.
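At a high level (a summary of the loss actually built in `train.py` below, not a full restatement of the paper): the text is encoded by a convolutional layer with max pooling, text and tags are embedded into a common space, and training minimizes a pairwise hinge loss over cosine similarities between the text representation and a positive/negative tag pair:

```latex
L = \max\left(0,\ m - \cos(h_{text}, e_{t^{+}}) + \cos(h_{text}, e_{t^{-}})\right)
```

where `m` is the margin (0.1 by default in this example) and the negative tag is sampled uniformly at random.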
## Data download
[ag news dataset](https://github.com/mhjabreel/CharCNN/tree/master/data/ag_news_csv)
The data format is as follows:
```
"3","Wall St. Bears Claw Back Into the Black (Reuters)","Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again."
```
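Each row is a CSV record: column 0 is the class label (used as the tag), column 1 is the title, and column 2 is the description that actually gets tokenized. A minimal sketch (illustrative only, not part of this example's scripts) of how one row maps to the (tag, text) pair that `utils.py` consumes:

```python
import csv

# Illustrative only: mirror how utils.py reads the AG News CSV, taking
# column 0 as the tag (class label) and column 2 as the text.
with open("small_train.txt") as f:
    row = next(csv.reader(f))
    tag, title, text = row[0], row[1], row[2]
    print(tag, text[:50])
```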
## Training
Passing `--use_cuda 1` selects GPU training; omitting it (the default) selects CPU.

GPU environment: run `CUDA_VISIBLE_DEVICES=0 python train.py train_file test_file --use_cuda 1` to start training, for example:
```
CUDA_VISIBLE_DEVICES=0 python train.py small_train.txt small_test.txt --use_cuda 1
```
CPU environment: run `python train.py train_file test_file` to start training, for example:
```
python train.py small_train.txt small_test.txt
```
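Every 10 batches, `train.py` prints the batch's average loss and accuracy, and after each epoch it saves an inference model to `model_dim10_2/epoch_<n>`. Based on the print statements in `train.py`, the log looks like this (the numbers below are illustrative):

```text
TRAIN --> pass: 0 batch_id: 0 avg_cost: [0.69], acc: 0.51
epoch:1 num_steps:119 time_cost(s):3.20
```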
## Future work
- Inference will be added.
- Multiple types of pairwise loss will be added to this project.
(The source diffs of the two data files are too large to display; view the blobs instead.)
import os
import sys
import time
import six
import numpy as np
import math
import argparse
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers.nn as nn
import paddle.fluid.layers.tensor as tensor
import paddle.fluid.layers.control_flow as cf
import paddle.fluid.layers.io as io
import utils

SEED = 102

def parse_args():
    parser = argparse.ArgumentParser("TagSpace benchmark.")
    parser.add_argument('train_file')
    parser.add_argument('test_file')
    parser.add_argument(
        '--use_cuda', type=int, default=0,
        help='whether to use GPU (1) or CPU (0, default)')
    args = parser.parse_args()
    return args

def network(vocab_text_size, vocab_tag_size, emb_dim=10, hid_dim=1000,
            win_size=5, margin=0.1):
    """ network definition """
    text = io.data(name="text", shape=[1], lod_level=1, dtype='int64')
    pos_tag = io.data(name="pos_tag", shape=[1], lod_level=1, dtype='int64')
    neg_tag = io.data(name="neg_tag", shape=[1], lod_level=1, dtype='int64')
    text_emb = nn.embedding(
        input=text, size=[vocab_text_size, emb_dim], param_attr="text_emb")
    # positive and negative tags share one embedding table ("tag_emb")
    pos_tag_emb = nn.embedding(
        input=pos_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb")
    neg_tag_emb = nn.embedding(
        input=neg_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb")
    # convolution over the word embeddings followed by max pooling
    conv_1d = fluid.nets.sequence_conv_pool(
        input=text_emb,
        num_filters=hid_dim,
        filter_size=win_size,
        act="tanh",
        pool_type="max",
        param_attr="cnn")
    text_hid = fluid.layers.fc(input=conv_1d, size=emb_dim, param_attr="text_hid")
    cos_pos = nn.cos_sim(pos_tag_emb, text_hid)
    cos_neg = nn.cos_sim(neg_tag_emb, text_hid)
    # pairwise hinge loss: max(0, margin - cos_pos + cos_neg)
    loss_part1 = nn.elementwise_sub(
        tensor.fill_constant_batch_size_like(
            input=cos_pos, shape=[-1, 1], value=margin, dtype='float32'),
        cos_pos)
    loss_part2 = nn.elementwise_add(loss_part1, cos_neg)
    loss_part3 = nn.elementwise_max(
        tensor.fill_constant_batch_size_like(
            input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
        loss_part2)
    avg_cost = nn.mean(loss_part3)
    # a prediction is "correct" when the positive tag outscores the negative one
    less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32')
    correct = nn.reduce_sum(less)
    return text, pos_tag, neg_tag, avg_cost, correct, cos_pos

def train(train_reader, vocab_text, vocab_tag, base_lr, batch_size,
          pass_num, use_cuda, model_dir):
    """ train network """
    vocab_text_size = len(vocab_text)
    vocab_tag_size = len(vocab_tag)
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    # Train program
    text, pos_tag, neg_tag, avg_cost, correct, pos_cos = network(
        vocab_text_size, vocab_tag_size)
    # Optimization to minimize loss
    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=base_lr)
    sgd_optimizer.minimize(avg_cost)
    # Initialize executor
    startup_program = fluid.default_startup_program()
    loop_program = fluid.default_main_program()
    exe = fluid.Executor(place)
    exe.run(startup_program)
    total_time = 0.0
    for pass_idx in range(pass_num):
        epoch_idx = pass_idx + 1
        print("epoch_%d start" % epoch_idx)
        t0 = time.time()
        for batch_id, data in enumerate(train_reader()):
            lod_text_seq = utils.to_lodtensor([dat[0] for dat in data], place)
            lod_pos_tag = utils.to_lodtensor([dat[1] for dat in data], place)
            lod_neg_tag = utils.to_lodtensor([dat[2] for dat in data], place)
            loss_val, correct_val = exe.run(
                loop_program,
                feed={
                    "text": lod_text_seq,
                    "pos_tag": lod_pos_tag,
                    "neg_tag": lod_neg_tag
                },
                fetch_list=[avg_cost, correct])
            if batch_id % 10 == 0:
                print("TRAIN --> pass: {} batch_id: {} avg_cost: {}, acc: {}"
                      .format(pass_idx, batch_id, loss_val,
                              float(correct_val) / batch_size))
        t1 = time.time()
        total_time += t1 - t0
        print("epoch:%d num_steps:%d time_cost(s):%f" %
              (epoch_idx, batch_id, total_time / epoch_idx))
        # save an inference model after every epoch
        save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
        feed_var_names = ["text", "pos_tag"]
        fetch_vars = [pos_cos]
        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
    print("finish training")

def train_net():
    """ do training """
    args = parse_args()
    train_file = args.train_file
    test_file = args.test_file
    use_cuda = True if args.use_cuda else False
    batch_size = 100
    vocab_text, vocab_tag, train_reader, test_reader = utils.prepare_data(
        train_file, test_file, batch_size=batch_size,
        buffer_size=batch_size * 100, word_freq_threshold=0)
    train(
        train_reader=train_reader,
        vocab_text=vocab_text,
        vocab_tag=vocab_tag,
        base_lr=0.01,
        batch_size=batch_size,
        pass_num=10,
        use_cuda=use_cuda,
        model_dir="model_dim10_2")


if __name__ == "__main__":
    train_net()
import re
import sys
import collections
import six
import time
import numpy as np
import paddle.fluid as fluid
import paddle
import csv
def to_lodtensor(data, place):
    """ convert a batch of variable-length sequences to one LoDTensor """
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res
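

# For illustration (hypothetical values): three int64 sequences of lengths
# 2, 3 and 1 flatten into a (6, 1) tensor whose LoD is [[0, 2, 5, 6]];
# each consecutive pair of offsets marks one sequence's boundaries.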
def prepare_data(train_filename,
                 test_filename,
                 batch_size,
                 neg_size=1,
                 buffer_size=1000,
                 word_freq_threshold=0,
                 enable_ce=False):
    """ prepare the AG's News Topic Classification data """
    print("start construct word dict")
    # column 2 holds the news text, column 0 the tag (class label)
    vocab_text = build_dict(2, word_freq_threshold, train_filename, test_filename)
    vocab_tag = build_dict(0, word_freq_threshold, train_filename, test_filename)
    print("construct word dict done\n")
    train_reader = sort_batch(
        paddle.reader.shuffle(
            train(
                train_filename,
                vocab_text,
                vocab_tag,
                buffer_size,
                data_type=DataType.SEQ),
            buf_size=buffer_size),
        batch_size, batch_size * 20)
    test_reader = sort_batch(
        test(
            test_filename,
            vocab_text,
            vocab_tag,
            buffer_size,
            data_type=DataType.SEQ),
        batch_size, batch_size * 20)
    return vocab_text, vocab_tag, train_reader, test_reader

def sort_batch(reader, batch_size, sort_group_size, drop_last=False):
    """
    Create a batched reader that partially sorts instances by sequence length.
    :param reader: the data reader to read from.
    :type reader: callable
    :param batch_size: size of each mini-batch
    :type batch_size: int
    :param sort_group_size: size of each partially sorted group
    :type sort_group_size: int
    :param drop_last: drop the last batch if its size is not equal to batch_size.
    :type drop_last: bool
    :return: the batched reader.
    :rtype: callable
    """
    # Batch size check
    batch_size = int(batch_size)
    if batch_size <= 0:
        raise ValueError("batch_size should be a positive integer value, "
                         "but got batch_size={}".format(batch_size))

    def batch_reader():
        r = reader()
        b = []
        for instance in r:
            b.append(instance)
            if len(b) == sort_group_size:
                # sort the group by text length, then emit full batches
                sortl = sorted(b, key=lambda x: len(x[0]), reverse=True)
                b = []
                c = []
                for sort_i in sortl:
                    c.append(sort_i)
                    if len(c) == batch_size:
                        yield c
                        c = []
        if not drop_last and len(b) != 0:
            # handle the final, possibly smaller, group the same way;
            # note that a remainder smaller than batch_size is discarded
            sortl = sorted(b, key=lambda x: len(x[0]), reverse=True)
            c = []
            for sort_i in sortl:
                c.append(sort_i)
                if len(c) == batch_size:
                    yield c
                    c = []

    return batch_reader

class DataType(object):
    SEQ = 2

def word_count(column_num, input_file, word_freq=None):
    """
    compute word count from corpus
    """
    if word_freq is None:
        word_freq = collections.defaultdict(int)
    data_file = csv.reader(input_file)
    for row in data_file:
        for w in re.split(r'\W+', row[column_num].strip()):
            word_freq[w] += 1
    return word_freq

def build_dict(column_num=2, min_word_freq=50, train_filename="", test_filename=""):
    """
    Build a word dictionary from the corpus. Keys of the dictionary are words,
    and values are zero-based IDs of these words.
    """
    with open(train_filename) as trainf:
        with open(test_filename) as testf:
            word_freq = word_count(column_num, testf,
                                   word_count(column_num, trainf))
    word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq]
    word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*word_freq_sorted))
    word_idx = dict(list(zip(words, six.moves.range(len(words)))))
    return word_idx
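

# For illustration (hypothetical toy input): calling build_dict(0, 0, ...) over
# the four AG News class labels returns something like
# {"1": 0, "2": 1, "3": 2, "4": 3}, with IDs assigned by descending frequency
# and ties broken lexicographically.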
def reader_creator(filename, text_idx, tag_idx, n, data_type):
    """ create a reader yielding (text, pos_tag, neg_tag) triples """

    def reader():
        with open(filename) as input_file:
            data_file = csv.reader(input_file)
            for row in data_file:
                text_raw = re.split(r'\W+', row[2].strip())
                text = [text_idx.get(w) for w in text_raw]
                tag_raw = re.split(r'\W+', row[0].strip())
                pos_index = tag_idx.get(tag_raw[0])
                pos_tag = [pos_index]
                # sample one negative tag uniformly at random, rejecting the
                # positive tag; give up after max_iter attempts
                neg_tag = []
                max_iter = 100
                now_iter = 0
                sum_n = 0
                while sum_n < 1:
                    now_iter += 1
                    if now_iter > max_iter:
                        print("error : only one class")
                        sys.exit(0)
                    rand_i = np.random.randint(0, len(tag_idx))
                    if rand_i != pos_index:
                        neg_tag.append(rand_i)
                        sum_n += 1
                # skip texts longer than n tokens (when n > 0)
                if n > 0 and len(text) > n:
                    continue
                yield text, pos_tag, neg_tag

    return reader

def train(filename, text_idx, tag_idx, n, data_type=DataType.SEQ):
    return reader_creator(filename, text_idx, tag_idx, n, data_type)


def test(filename, text_idx, tag_idx, n, data_type=DataType.SEQ):
    return reader_creator(filename, text_idx, tag_idx, n, data_type)