Commit 3b3d7206 authored by frankwhzhang, committed by Yi Liu

tagspace (#1406)

Parent bc83661a
# TagSpace
The following is a brief overview of this example's directory layout:
```text
.
├── README.md          # documentation
├── train.py           # training script
├── utils.py           # common utility functions
├── small_train.txt    # small sample of the training set
└── small_test.txt     # small sample of the test set
```
## Introduction
The TagSpace model is described in the paper [#TagSpace: Semantic Embeddings from Hashtags](https://research.fb.com/publications/tagspace-semantic-embeddings-from-hashtags/). This example implements the TagSpace model.
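At a high level (a summary of the loss actually built in `train.py` below, not a full restatement of the paper): the text is encoded by a convolutional layer with max pooling, text and tags are embedded into a common space, and training minimizes a pairwise hinge loss over cosine similarities between the text representation and a positive/negative tag pair:

```latex
L = \max\left(0,\ m - \cos(h_{text}, e_{t^{+}}) + \cos(h_{text}, e_{t^{-}})\right)
```

where `m` is the margin (0.1 by default in this example) and the negative tag is sampled uniformly at random.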
## Data download
[ag news dataset](https://github.com/mhjabreel/CharCNN/tree/master/data/ag_news_csv)
The data format is as follows:
```
"3","Wall St. Bears Claw Back Into the Black (Reuters)","Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again."
```
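Each row is a CSV record: column 0 is the class label (used as the tag), column 1 is the title, and column 2 is the description that actually gets tokenized. A minimal sketch (illustrative only, not part of this example's scripts) of how one row maps to the (tag, text) pair that `utils.py` consumes:

```python
import csv

# Illustrative only: mirror how utils.py reads the AG News CSV, taking
# column 0 as the tag (class label) and column 2 as the text.
with open("small_train.txt") as f:
    row = next(csv.reader(f))
    tag, title, text = row[0], row[1], row[2]
    print(tag, text[:50])
```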
## Training
Passing `--use_cuda 1` selects GPU training; omitting it (the default) selects CPU.

GPU environment: run `CUDA_VISIBLE_DEVICES=0 python train.py train_file test_file --use_cuda 1` to start training, for example:
```
CUDA_VISIBLE_DEVICES=0 python train.py small_train.txt small_test.txt --use_cuda 1
```
CPU environment: run `python train.py train_file test_file` to start training, for example:
```
python train.py small_train.txt small_test.txt
```
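Every 10 batches, `train.py` prints the batch's average loss and accuracy, and after each epoch it saves an inference model to `model_dim10_2/epoch_<n>`. Based on the print statements in `train.py`, the log looks like this (the numbers below are illustrative):

```text
TRAIN --> pass: 0 batch_id: 0 avg_cost: [0.69], acc: 0.51
epoch:1 num_steps:119 time_cost(s):3.20
```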
## Future work
- Inference will be added.
- Multiple types of pairwise loss will be added to this project.
(The source diffs of the two data files are too large to display; view the blobs instead.)
import os
import sys
import time
import six
import numpy as np
import math
import argparse
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers.nn as nn
import paddle.fluid.layers.tensor as tensor
import paddle.fluid.layers.control_flow as cf
import paddle.fluid.layers.io as io
import utils

SEED = 102

def parse_args():
    parser = argparse.ArgumentParser("TagSpace benchmark.")
    parser.add_argument('train_file')
    parser.add_argument('test_file')
    parser.add_argument(
        '--use_cuda', type=int, default=0,
        help='whether to use GPU (1) or CPU (0, default)')
    args = parser.parse_args()
    return args

def network(vocab_text_size, vocab_tag_size, emb_dim=10, hid_dim=1000,
            win_size=5, margin=0.1):
    """ network definition """
    text = io.data(name="text", shape=[1], lod_level=1, dtype='int64')
    pos_tag = io.data(name="pos_tag", shape=[1], lod_level=1, dtype='int64')
    neg_tag = io.data(name="neg_tag", shape=[1], lod_level=1, dtype='int64')
    text_emb = nn.embedding(
        input=text, size=[vocab_text_size, emb_dim], param_attr="text_emb")
    # positive and negative tags share one embedding table ("tag_emb")
    pos_tag_emb = nn.embedding(
        input=pos_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb")
    neg_tag_emb = nn.embedding(
        input=neg_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb")
    # convolution over the word embeddings followed by max pooling
    conv_1d = fluid.nets.sequence_conv_pool(
        input=text_emb,
        num_filters=hid_dim,
        filter_size=win_size,
        act="tanh",
        pool_type="max",
        param_attr="cnn")
    text_hid = fluid.layers.fc(input=conv_1d, size=emb_dim, param_attr="text_hid")
    cos_pos = nn.cos_sim(pos_tag_emb, text_hid)
    cos_neg = nn.cos_sim(neg_tag_emb, text_hid)
    # pairwise hinge loss: max(0, margin - cos_pos + cos_neg)
    loss_part1 = nn.elementwise_sub(
        tensor.fill_constant_batch_size_like(
            input=cos_pos, shape=[-1, 1], value=margin, dtype='float32'),
        cos_pos)
    loss_part2 = nn.elementwise_add(loss_part1, cos_neg)
    loss_part3 = nn.elementwise_max(
        tensor.fill_constant_batch_size_like(
            input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
        loss_part2)
    avg_cost = nn.mean(loss_part3)
    # a prediction is "correct" when the positive tag outscores the negative one
    less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32')
    correct = nn.reduce_sum(less)
    return text, pos_tag, neg_tag, avg_cost, correct, cos_pos

def train(train_reader, vocab_text, vocab_tag, base_lr, batch_size,
          pass_num, use_cuda, model_dir):
    """ train network """
    vocab_text_size = len(vocab_text)
    vocab_tag_size = len(vocab_tag)
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    # Train program
    text, pos_tag, neg_tag, avg_cost, correct, pos_cos = network(
        vocab_text_size, vocab_tag_size)
    # Optimization to minimize loss
    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=base_lr)
    sgd_optimizer.minimize(avg_cost)
    # Initialize executor
    startup_program = fluid.default_startup_program()
    loop_program = fluid.default_main_program()
    exe = fluid.Executor(place)
    exe.run(startup_program)
    total_time = 0.0
    for pass_idx in range(pass_num):
        epoch_idx = pass_idx + 1
        print("epoch_%d start" % epoch_idx)
        t0 = time.time()
        for batch_id, data in enumerate(train_reader()):
            lod_text_seq = utils.to_lodtensor([dat[0] for dat in data], place)
            lod_pos_tag = utils.to_lodtensor([dat[1] for dat in data], place)
            lod_neg_tag = utils.to_lodtensor([dat[2] for dat in data], place)
            loss_val, correct_val = exe.run(
                loop_program,
                feed={
                    "text": lod_text_seq,
                    "pos_tag": lod_pos_tag,
                    "neg_tag": lod_neg_tag
                },
                fetch_list=[avg_cost, correct])
            if batch_id % 10 == 0:
                print("TRAIN --> pass: {} batch_id: {} avg_cost: {}, acc: {}"
                      .format(pass_idx, batch_id, loss_val,
                              float(correct_val) / batch_size))
        t1 = time.time()
        total_time += t1 - t0
        print("epoch:%d num_steps:%d time_cost(s):%f" %
              (epoch_idx, batch_id, total_time / epoch_idx))
        # save an inference model after every epoch
        save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
        feed_var_names = ["text", "pos_tag"]
        fetch_vars = [pos_cos]
        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
    print("finish training")

def train_net():
    """ do training """
    args = parse_args()
    train_file = args.train_file
    test_file = args.test_file
    use_cuda = True if args.use_cuda else False
    batch_size = 100
    vocab_text, vocab_tag, train_reader, test_reader = utils.prepare_data(
        train_file, test_file, batch_size=batch_size,
        buffer_size=batch_size * 100, word_freq_threshold=0)
    train(
        train_reader=train_reader,
        vocab_text=vocab_text,
        vocab_tag=vocab_tag,
        base_lr=0.01,
        batch_size=batch_size,
        pass_num=10,
        use_cuda=use_cuda,
        model_dir="model_dim10_2")


if __name__ == "__main__":
    train_net()
import re
import sys
import collections
import six
import time
import numpy as np
import paddle.fluid as fluid
import paddle
import csv
def to_lodtensor(data, place):
    """ convert a batch of variable-length sequences to one LoDTensor """
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res
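

# For illustration (hypothetical values): three int64 sequences of lengths
# 2, 3 and 1 flatten into a (6, 1) tensor whose LoD is [[0, 2, 5, 6]];
# each consecutive pair of offsets marks one sequence's boundaries.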
def prepare_data(train_filename,
                 test_filename,
                 batch_size,
                 neg_size=1,
                 buffer_size=1000,
                 word_freq_threshold=0,
                 enable_ce=False):
    """ prepare the AG's News Topic Classification data """
    print("start construct word dict")
    # column 2 holds the news text, column 0 the tag (class label)
    vocab_text = build_dict(2, word_freq_threshold, train_filename, test_filename)
    vocab_tag = build_dict(0, word_freq_threshold, train_filename, test_filename)
    print("construct word dict done\n")
    train_reader = sort_batch(
        paddle.reader.shuffle(
            train(
                train_filename,
                vocab_text,
                vocab_tag,
                buffer_size,
                data_type=DataType.SEQ),
            buf_size=buffer_size),
        batch_size, batch_size * 20)
    test_reader = sort_batch(
        test(
            test_filename,
            vocab_text,
            vocab_tag,
            buffer_size,
            data_type=DataType.SEQ),
        batch_size, batch_size * 20)
    return vocab_text, vocab_tag, train_reader, test_reader

def sort_batch(reader, batch_size, sort_group_size, drop_last=False):
    """
    Create a batched reader that partially sorts instances by sequence length.
    :param reader: the data reader to read from.
    :type reader: callable
    :param batch_size: size of each mini-batch
    :type batch_size: int
    :param sort_group_size: size of each partially sorted group
    :type sort_group_size: int
    :param drop_last: drop the last batch if its size is not equal to batch_size.
    :type drop_last: bool
    :return: the batched reader.
    :rtype: callable
    """
    # Batch size check
    batch_size = int(batch_size)
    if batch_size <= 0:
        raise ValueError("batch_size should be a positive integer value, "
                         "but got batch_size={}".format(batch_size))

    def batch_reader():
        r = reader()
        b = []
        for instance in r:
            b.append(instance)
            if len(b) == sort_group_size:
                # sort the group by text length, then emit full batches
                sortl = sorted(b, key=lambda x: len(x[0]), reverse=True)
                b = []
                c = []
                for sort_i in sortl:
                    c.append(sort_i)
                    if len(c) == batch_size:
                        yield c
                        c = []
        if not drop_last and len(b) != 0:
            # handle the final, possibly smaller, group the same way;
            # note that a remainder smaller than batch_size is discarded
            sortl = sorted(b, key=lambda x: len(x[0]), reverse=True)
            c = []
            for sort_i in sortl:
                c.append(sort_i)
                if len(c) == batch_size:
                    yield c
                    c = []

    return batch_reader

class DataType(object):
    SEQ = 2

def word_count(column_num, input_file, word_freq=None):
    """
    compute word count from corpus
    """
    if word_freq is None:
        word_freq = collections.defaultdict(int)
    data_file = csv.reader(input_file)
    for row in data_file:
        for w in re.split(r'\W+', row[column_num].strip()):
            word_freq[w] += 1
    return word_freq

def build_dict(column_num=2, min_word_freq=50, train_filename="", test_filename=""):
    """
    Build a word dictionary from the corpus. Keys of the dictionary are words,
    and values are zero-based IDs of these words.
    """
    with open(train_filename) as trainf:
        with open(test_filename) as testf:
            word_freq = word_count(column_num, testf,
                                   word_count(column_num, trainf))
    word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq]
    word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*word_freq_sorted))
    word_idx = dict(list(zip(words, six.moves.range(len(words)))))
    return word_idx
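

# For illustration (hypothetical toy input): calling build_dict(0, 0, ...) over
# the four AG News class labels returns something like
# {"1": 0, "2": 1, "3": 2, "4": 3}, with IDs assigned by descending frequency
# and ties broken lexicographically.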
def reader_creator(filename, text_idx, tag_idx, n, data_type):
    """ create a reader yielding (text, pos_tag, neg_tag) triples """

    def reader():
        with open(filename) as input_file:
            data_file = csv.reader(input_file)
            for row in data_file:
                text_raw = re.split(r'\W+', row[2].strip())
                text = [text_idx.get(w) for w in text_raw]
                tag_raw = re.split(r'\W+', row[0].strip())
                pos_index = tag_idx.get(tag_raw[0])
                pos_tag = [pos_index]
                # sample one negative tag uniformly at random, rejecting the
                # positive tag; give up after max_iter attempts
                neg_tag = []
                max_iter = 100
                now_iter = 0
                sum_n = 0
                while sum_n < 1:
                    now_iter += 1
                    if now_iter > max_iter:
                        print("error : only one class")
                        sys.exit(0)
                    rand_i = np.random.randint(0, len(tag_idx))
                    if rand_i != pos_index:
                        neg_tag.append(rand_i)
                        sum_n += 1
                # skip texts longer than n tokens (when n > 0)
                if n > 0 and len(text) > n:
                    continue
                yield text, pos_tag, neg_tag

    return reader

def train(filename, text_idx, tag_idx, n, data_type=DataType.SEQ):
    return reader_creator(filename, text_idx, tag_idx, n, data_type)


def test(filename, text_idx, tag_idx, n, data_type=DataType.SEQ):
    return reader_creator(filename, text_idx, tag_idx, n, data_type)