Commit bb61076b authored by frankwhzhang

add tagspace for 1.2

Parent 25fd8885
@@ -75,76 +75,51 @@ def train():
     exe = fluid.Executor(place)
     if parallel:
         train_exe = fluid.ParallelExecutor(
-            use_cuda=use_cuda, loss_name=avg_cost.name)
+            use_cuda=use_cuda,
+            loss_name=avg_cost.name)
     else:
         train_exe = exe
 
-    def train_loop(main_program):
-        """ train network """
-        pass_num = args.pass_num
-        model_dir = args.model_dir
-        fetch_list = [avg_cost.name]
+    pass_num = args.pass_num
+    model_dir = args.model_dir
+    fetch_list = [avg_cost.name]
 
-        exe.run(fluid.default_startup_program())
-        total_time = 0.0
-        for pass_idx in six.moves.xrange(pass_num):
-            epoch_idx = pass_idx + 1
-            print("epoch_%d start" % epoch_idx)
+    exe.run(fluid.default_startup_program())
+    total_time = 0.0
+    for pass_idx in six.moves.xrange(pass_num):
+        epoch_idx = pass_idx + 1
+        print("epoch_%d start" % epoch_idx)
 
-            t0 = time.time()
-            i = 0
-            newest_ppl = 0
-            for data in train_reader():
-                i += 1
-                lod_src_wordseq = utils.to_lodtensor([dat[0] for dat in data],
-                                                     place)
-                lod_dst_wordseq = utils.to_lodtensor([dat[1] for dat in data],
-                                                     place)
-                ret_avg_cost = train_exe.run(main_program,
-                                             feed={"src_wordseq": lod_src_wordseq,
-                                                   "dst_wordseq": lod_dst_wordseq},
-                                             fetch_list=fetch_list)
-                avg_ppl = np.exp(ret_avg_cost[0])
-                newest_ppl = np.mean(avg_ppl)
-                if i % args.print_batch == 0:
-                    print("step:%d ppl:%.3f" % (i, newest_ppl))
+        t0 = time.time()
+        i = 0
+        newest_ppl = 0
+        for data in train_reader():
+            i += 1
+            lod_src_wordseq = utils.to_lodtensor([dat[0] for dat in data],
+                                                 place)
+            lod_dst_wordseq = utils.to_lodtensor([dat[1] for dat in data],
+                                                 place)
+            ret_avg_cost = train_exe.run(feed={
+                "src_wordseq": lod_src_wordseq,
+                "dst_wordseq": lod_dst_wordseq},
+                fetch_list=fetch_list)
+            avg_ppl = np.exp(ret_avg_cost[0])
+            newest_ppl = np.mean(avg_ppl)
+            if i % args.print_batch == 0:
+                print("step:%d ppl:%.3f" % (i, newest_ppl))
 
-            t1 = time.time()
-            total_time += t1 - t0
-            print("epoch:%d num_steps:%d time_cost(s):%f" %
-                  (epoch_idx, i, total_time / epoch_idx))
-            save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
-            feed_var_names = ["src_wordseq", "dst_wordseq"]
-            fetch_vars = [avg_cost, acc]
-            fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
-            print("model saved in %s" % save_dir)
-        #exe.close()
-        print("finish training")
+        t1 = time.time()
+        total_time += t1 - t0
+        print("epoch:%d num_steps:%d time_cost(s):%f" %
+              (epoch_idx, i, total_time / epoch_idx))
+        save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
+        feed_var_names = ["src_wordseq", "dst_wordseq"]
+        fetch_vars = [avg_cost, acc]
+        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
+        print("model saved in %s" % save_dir)
+    #exe.close()
+    print("finish training")
 
-    if args.is_local:
-        print("run local training")
-        train_loop(fluid.default_main_program())
-    else:
-        print("run distribute training")
-        port = os.getenv("PADDLE_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_PSERVERS")  # ip,ip...
-        eplist = []
-        for ip in pserver_ips.split(","):
-            eplist.append(':'.join([ip, port]))
-        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
-        current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
-        t = fluid.DistributeTranspiler()
-        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
-        if training_role == "PSERVER":
-            pserver_prog = t.get_pserver_program(current_endpoint)
-            pserver_startup = t.get_startup_program(current_endpoint,
-                                                    pserver_prog)
-            exe.run(pserver_startup)
-            exe.run(pserver_prog)
-        elif training_role == "TRAINER":
-            train_loop(t.get_trainer_program())
 
 if __name__ == "__main__":
     train()
@@ -7,17 +7,27 @@
 ├── README.md          # documentation
 ├── train.py           # training script
 ├── infer.py           # inference script
+├── net.py             # network structure
+├── text2paddle.py     # converts raw text into Paddle-format data
+├── cluster_train.py   # distributed training
+├── cluster_train.sh   # distributed training launcher
 ├── utils              # common helper functions
-├── small_train.txt    # small-sample training set
-└── small_test.txt     # small-sample test set
+├── vocab_text.txt     # small-sample text vocabulary
+├── vocab_tag.txt      # small-sample tag vocabulary
+├── train_data         # small-sample training directory
+└── test_data          # small-sample test directory
 ```
 ## Introduction
-The TagSpace model is introduced in the paper [#TagSpace: Semantic Embeddings from Hashtags](https://research.fb.com/publications/tagspace-semantic-embeddings-from-hashtags/); in this example we implement the TagSpace model.
-## Data download
+The TagSpace model is introduced in the paper [#TagSpace: Semantic Embeddings from Hashtags](https://research.fb.com/publications/tagspace-semantic-embeddings-from-hashtags/).
+TagSpace learns embedding representations of texts and their tags for industrial-scale tag recommendation; a typical application is recommending tags for feed news.
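The training objective implemented in net.py (shown later in this diff) is a max-margin hinge loss on cosine similarities between the convolutional text representation and the tag embeddings. As a sketch (the margin defaults to 0.1, and the hardest of the sampled negative tags is used):

```latex
L = \max\left(0,\; margin - \cos(e_{text}, e_{tag^{+}}) + \max_{tag^{-}} \cos(e_{text}, e_{tag^{-}})\right)
```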
+## Data download and preprocessing
 [ag news dataset](https://github.com/mhjabreel/CharCNN/tree/master/data/ag_news_csv)
@@ -27,23 +37,54 @@ The TagSpace model is introduced in the paper [#TagSpace: Semantic Embeddings fr
 "3","Wall St. Bears Claw Back Into the Black (Reuters)","Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again."
 ```
-## Training
-'--use_cuda 1' means train on GPU; by default the CPU is used
+Convert the text data into Paddle format. First move the raw data into the training and test data directories:
+```
+mv train.csv raw_big_train_data
+mv test.csv raw_big_test_data
+```
+Then run text2paddle.py to generate the Paddle input format:
+```
+python text2paddle.py raw_big_train_data/ raw_big_test_data/ train_big_data test_big_data big_vocab_text.txt big_vocab_tag.txt
+```
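Each line of the converted data is the tag id, a comma, and then the space-separated token ids of the text. The ids below are illustrative only; actual values depend on the generated vocabularies:

```
2,45 118 6 92 3071
```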
+## Single-machine training
+'--use_cuda 1' means train on GPU, 0 means train on CPU; '--parallel 1' means use multiple cards
+Training on the small dataset (the sample data is already prepared, so you can skip the preprocessing above and run the command directly):
 GPU environment
-Run `CUDA_VISIBLE_DEVICES=0 python train.py train_file test_file --use_cuda 1` to start training.
 ```
-CUDA_VISIBLE_DEVICES=0 python train.py small_train.txt small_test.txt --use_cuda 1
+CUDA_VISIBLE_DEVICES=0 python train.py --use_cuda 1
 ```
 CPU environment
-Run `python train.py train_file test_file` to start training.
 ```
-python train.py small_train.txt small_test.txt
+python train.py
 ```
+Full-data single-machine single-card training:
+```
+CUDA_VISIBLE_DEVICES=0 python train.py --use_cuda 1 --train_dir train_big_data/ --vocab_text_path big_vocab_text.txt --vocab_tag_path big_vocab_tag.txt --model_dir big_model --batch_size 500
+```
+Full-data single-machine multi-card training:
+```
+python train.py --train_dir train_big_data/ --vocab_text_path big_vocab_text.txt --vocab_tag_path big_vocab_tag.txt --model_dir big_model --batch_size 500 --parallel 1
+```
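The trainer prints progress in the format below (the numbers here are illustrative; the formats come from the print statements in train.py):

```
epoch_1 start
TRAIN --> pass: 0 batch_num: 5000 avg_cost: 0.712, acc: 0.8
epoch:1 num_steps:1680 time_cost(s):94.3
```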
 ## Inference
+Small-data inference:
+```
+python infer.py
+```
+Full-data inference:
 ```
-CUDA_VISIBLE_DEVICES=0 python infer.py model/ 1 10 small_train.txt small_test.txt --use_cuda 1
+python infer.py --model_dir big_model --vocab_tag_path big_vocab_tag.txt --test_dir test_big_data/
 ```
+## Simulating a multi-machine setup locally
+Run:
+```
+sh cluster_train.sh
+```
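cluster_train.sh (included later in this diff) starts two parameter servers and two trainers in the background and redirects their output to pserver*.log and trainer*.log, so training progress can be followed with, for example:

```
tail -f trainer0.log
```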
import os
import sys
import time
import six
import numpy as np
import math
import argparse
import paddle
import paddle.fluid as fluid
import utils
import net

SEED = 102


def parse_args():
    parser = argparse.ArgumentParser("TagSpace benchmark.")
    parser.add_argument(
        '--neg_size', type=int, default=3, help='neg/pos ratio')
    parser.add_argument(
        '--train_dir', type=str, default='train_data', help='train file address')
    parser.add_argument(
        '--vocab_text_path', type=str, default='vocab_text.txt', help='vocab_text file address')
    parser.add_argument(
        '--vocab_tag_path', type=str, default='vocab_tag.txt', help='vocab_tag file address')
    parser.add_argument(
        '--is_local', type=int, default=1, help='whether to run local training')
    parser.add_argument(
        '--model_dir', type=str, default='model_', help='model dir')
    parser.add_argument(
        '--batch_size', type=int, default=5, help='batch size')
    parser.add_argument(
        '--print_batch', type=int, default=10, help='print progress every N batches')
    parser.add_argument(
        '--pass_num', type=int, default=10, help='number of epochs')
    parser.add_argument(
        '--use_cuda', type=int, default=0, help='whether to use gpu')
    parser.add_argument(
        '--base_lr', type=float, default=0.01, help='learning rate')
    parser.add_argument(
        '--num_devices', type=int, default=1, help='number of GPU devices')
    parser.add_argument(
        '--role', type=str, default='pserver', help='trainer or pserver')
    parser.add_argument(
        '--endpoints', type=str, default='127.0.0.1:6000',
        help='the pserver endpoints, e.g. 127.0.0.1:6000,127.0.0.1:6001')
    parser.add_argument(
        '--current_endpoint', type=str, default='127.0.0.1:6000', help='the current endpoint')
    parser.add_argument(
        '--trainer_id', type=int, default=0, help='trainer id; only trainer_id=0 saves the model')
    parser.add_argument(
        '--trainers', type=int, default=1, help='number of trainers (default: 1)')
    args = parser.parse_args()
    return args


def get_cards(args):
    return args.num_devices


def train():
    """ do training """
    args = parse_args()
    train_dir = args.train_dir
    vocab_text_path = args.vocab_text_path
    vocab_tag_path = args.vocab_tag_path
    use_cuda = True if args.use_cuda else False
    batch_size = args.batch_size
    neg_size = args.neg_size
    vocab_text_size, vocab_tag_size, train_reader = utils.prepare_data(
        file_dir=train_dir, vocab_text_path=vocab_text_path,
        vocab_tag_path=vocab_tag_path, neg_size=neg_size,
        batch_size=batch_size * get_cards(args),
        buffer_size=batch_size * 100, is_train=True)
    # Train program
    text, pos_tag, neg_tag, avg_cost, correct, cos_pos = net.network(
        vocab_text_size, vocab_tag_size, neg_size=neg_size)
    # Optimization to minimize loss
    sgd_optimizer = fluid.optimizer.SGD(learning_rate=args.base_lr)
    sgd_optimizer.minimize(avg_cost)

    def train_loop(main_program):
        # Initialize executor
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        pass_num = args.pass_num
        model_dir = args.model_dir
        fetch_list = [avg_cost.name]
        exe.run(fluid.default_startup_program())
        total_time = 0.0
        for pass_idx in range(pass_num):
            epoch_idx = pass_idx + 1
            print("epoch_%d start" % epoch_idx)
            t0 = time.time()
            for batch_id, data in enumerate(train_reader()):
                lod_text_seq = utils.to_lodtensor([dat[0] for dat in data], place)
                lod_pos_tag = utils.to_lodtensor([dat[1] for dat in data], place)
                lod_neg_tag = utils.to_lodtensor([dat[2] for dat in data], place)
                # run the program passed in by the caller (the trainer program
                # in distributed mode, the default main program otherwise)
                loss_val, correct_val = exe.run(
                    main_program,
                    feed={
                        "text": lod_text_seq,
                        "pos_tag": lod_pos_tag,
                        "neg_tag": lod_neg_tag},
                    fetch_list=[avg_cost.name, correct.name])
                if batch_id % args.print_batch == 0:
                    print("TRAIN --> pass: {} batch_num: {} avg_cost: {}, acc: {}"
                          .format(pass_idx, (batch_id + 10) * batch_size,
                                  np.mean(loss_val),
                                  float(np.sum(correct_val)) / batch_size))
            t1 = time.time()
            total_time += t1 - t0
            print("epoch:%d num_steps:%d time_cost(s):%f" %
                  (epoch_idx, batch_id, total_time / epoch_idx))
            save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
            feed_var_names = ["text", "pos_tag"]
            fetch_vars = [cos_pos]
            fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
        print("finish training")

    if args.is_local:
        print("run local training")
        train_loop(fluid.default_main_program())
    else:
        print("run distribute training")
        t = fluid.DistributeTranspiler()
        t.transpile(args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
        if args.role == "pserver":
            print("run pserver")
            pserver_prog = t.get_pserver_program(args.current_endpoint)
            pserver_startup = t.get_startup_program(args.current_endpoint,
                                                    pserver_prog)
            exe = fluid.Executor(fluid.CPUPlace())
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif args.role == "trainer":
            print("run trainer")
            train_loop(t.get_trainer_program())


if __name__ == "__main__":
    train()
#!/bin/bash
#export GLOG_v=30
#export GLOG_logtostderr=1
# start pserver0
python cluster_train.py \
--train_dir train_data \
--model_dir cluster_model \
--batch_size 5 \
--is_local 0 \
--role pserver \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--current_endpoint 127.0.0.1:6000 \
--trainers 2 \
> pserver0.log 2>&1 &
# start pserver1
python cluster_train.py \
--train_dir train_data \
--model_dir cluster_model \
--batch_size 5 \
--is_local 0 \
--role pserver \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--current_endpoint 127.0.0.1:6001 \
--trainers 2 \
> pserver1.log 2>&1 &
# start trainer0
#CUDA_VISIBLE_DEVICES=1 python cluster_train.py \
python cluster_train.py \
--train_dir train_data \
--model_dir cluster_model \
--batch_size 5 \
--print_batch 10 \
--use_cuda 0 \
--is_local 0 \
--role trainer \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--trainers 2 \
--trainer_id 0 \
> trainer0.log 2>&1 &
# start trainer1
#CUDA_VISIBLE_DEVICES=2 python cluster_train.py \
python cluster_train.py \
--train_dir train_data \
--model_dir cluster_model \
--batch_size 5 \
--print_batch 10 \
--use_cuda 0 \
--is_local 0 \
--role trainer \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--trainers 2 \
--trainer_id 1 \
> trainer1.log 2>&1 &
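All four processes run in the background, so stop them once the experiment is done. Assuming no unrelated process matches the pattern, something like:

```
pkill -f cluster_train.py
```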
 import sys
+import argparse
 import time
 import math
 import unittest
@@ -9,6 +10,25 @@ import paddle.fluid as fluid
 import paddle
 import utils
 
+def parse_args():
+    parser = argparse.ArgumentParser("TagSpace benchmark.")
+    parser.add_argument(
+        '--test_dir', type=str, default='test_data', help='test file address')
+    parser.add_argument(
+        '--vocab_tag_path', type=str, default='vocab_tag.txt', help='vocab path')
+    parser.add_argument(
+        '--start_index', type=int, default=1, help='start index')
+    parser.add_argument(
+        '--last_index', type=int, default=10, help='end index')
+    parser.add_argument(
+        '--model_dir', type=str, default='model_', help='model dir')
+    parser.add_argument(
+        '--use_cuda', type=int, default=0, help='whether to use cuda')
+    parser.add_argument(
+        '--batch_size', type=int, default=5, help='batch size')
+    args = parser.parse_args()
+    return args
+
 def infer(test_reader, vocab_tag, use_cuda, model_path):
     """ inference function """
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
@@ -21,7 +41,7 @@ def infer(test_reader, vocab_tag, use_cuda, model_path):
     step_id = 0
     true_num = 0
     all_num = 0
-    size = len(vocab_tag)
+    size = vocab_tag
     value = []
     for data in test_reader():
         step_id += 1
@@ -47,30 +67,18 @@ def infer(test_reader, vocab_tag, use_cuda, model_path):
     t1 = time.time()
 
 if __name__ == "__main__":
-    if len(sys.argv) != 6:
-        print(
-            "Usage: %s model_dir start_epoch last_epoch(inclusive) train_file test_file"
-        )
-        exit(0)
-    train_file = ""
-    test_file = ""
-    model_dir = sys.argv[1]
-    try:
-        start_index = int(sys.argv[2])
-        last_index = int(sys.argv[3])
-        train_file = sys.argv[4]
-        test_file = sys.argv[5]
-    except:
-        print(
-            "Usage: %s model_dir start_epoch last_epoch(inclusive) train_file test_file"
-        )
-        exit(-1)
-    vocab_text, vocab_tag, train_reader, test_reader = utils.prepare_data(
-        train_file,
-        test_file,
-        batch_size=1,
-        buffer_size=1000,
-        word_freq_threshold=0)
+    args = parse_args()
+    start_index = args.start_index
+    last_index = args.last_index
+    test_dir = args.test_dir
+    model_dir = args.model_dir
+    batch_size = args.batch_size
+    vocab_tag_path = args.vocab_tag_path
+    use_cuda = True if args.use_cuda else False
+    print("start index: ", start_index, " last_index:", last_index)
+    vocab_text, vocab_tag, test_reader = utils.prepare_data(
+        test_dir, "", vocab_tag_path, batch_size=1,
+        neg_size=0, buffer_size=1000, is_train=False)
     for epoch in xrange(start_index, last_index + 1):
         epoch_path = model_dir + "/epoch_" + str(epoch)
......
import paddle.fluid as fluid
import paddle.fluid.layers.nn as nn
import paddle.fluid.layers.tensor as tensor
import paddle.fluid.layers.control_flow as cf
import paddle.fluid.layers.io as io


def network(vocab_text_size, vocab_tag_size, emb_dim=10, hid_dim=1000,
            win_size=5, margin=0.1, neg_size=5):
    """ network definition """
    text = io.data(name="text", shape=[1], lod_level=1, dtype='int64')
    pos_tag = io.data(name="pos_tag", shape=[1], lod_level=1, dtype='int64')
    neg_tag = io.data(name="neg_tag", shape=[1], lod_level=1, dtype='int64')
    text_emb = nn.embedding(
        input=text, size=[vocab_text_size, emb_dim], param_attr="text_emb")
    pos_tag_emb = nn.embedding(
        input=pos_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb")
    neg_tag_emb = nn.embedding(
        input=neg_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb")
    conv_1d = fluid.nets.sequence_conv_pool(
        input=text_emb,
        num_filters=hid_dim,
        filter_size=win_size,
        act="tanh",
        pool_type="max",
        param_attr="cnn")
    text_hid = fluid.layers.fc(input=conv_1d, size=emb_dim, param_attr="text_hid")
    cos_pos = nn.cos_sim(pos_tag_emb, text_hid)
    mul_text_hid = fluid.layers.sequence_expand_as(x=text_hid, y=neg_tag_emb)
    mul_cos_neg = nn.cos_sim(neg_tag_emb, mul_text_hid)
    cos_neg_all = fluid.layers.sequence_reshape(input=mul_cos_neg, new_dim=neg_size)
    # choose the max negative cosine
    cos_neg = nn.reduce_max(cos_neg_all, dim=1, keep_dim=True)
    # calculate hinge loss: max(0, margin - cos_pos + cos_neg)
    loss_part1 = nn.elementwise_sub(
        tensor.fill_constant_batch_size_like(
            input=cos_pos,
            shape=[-1, 1],
            value=margin,
            dtype='float32'),
        cos_pos)
    loss_part2 = nn.elementwise_add(loss_part1, cos_neg)
    loss_part3 = nn.elementwise_max(
        tensor.fill_constant_batch_size_like(
            input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
        loss_part2)
    avg_cost = nn.mean(loss_part3)
    less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32')
    correct = nn.reduce_sum(less)
    return text, pos_tag, neg_tag, avg_cost, correct, cos_pos
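For intuition, here is a minimal NumPy re-computation of the same hinge loss on toy cosine values (an illustrative sketch only; the real graph operates on LoD tensors inside a Paddle program):

```python
import numpy as np

margin = 0.1
cos_pos = np.array([0.8, 0.2])            # cos(text, positive tag), one per example
cos_neg_all = np.array([[0.5, 0.1, 0.3],  # cos(text, each sampled negative tag)
                        [0.6, 0.4, 0.2]])

cos_neg = cos_neg_all.max(axis=1)         # hardest negative, as reduce_max does
loss = np.maximum(0.0, margin - cos_pos + cos_neg)
avg_cost = loss.mean()                    # -> 0.25
correct = int((cos_neg < cos_pos).sum())  # numerator of the reported "acc" -> 1
print(avg_cost, correct)
```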
(Two added data files are too large to display as a source diff and can be viewed only as blobs; another diff is collapsed.)
import sys
import six
import collections
import os
import csv
import re


def word_count(column_num, input_file, word_freq=None):
    """
    compute word count from corpus
    """
    if word_freq is None:
        word_freq = collections.defaultdict(int)
    data_file = csv.reader(input_file)
    for row in data_file:
        for w in re.split(r'\W+', row[column_num].strip()):
            word_freq[w] += 1
    return word_freq


def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""):
    """
    Build a word dictionary from the corpus. Keys of the dictionary are words,
    and values are zero-based IDs of these words.
    """
    word_freq = collections.defaultdict(int)
    files = os.listdir(train_dir)
    for fi in files:
        with open(train_dir + '/' + fi, "r") as f:
            word_freq = word_count(column_num, f, word_freq)
    files = os.listdir(test_dir)
    for fi in files:
        with open(test_dir + '/' + fi, "r") as f:
            word_freq = word_count(column_num, f, word_freq)
    word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq]
    word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*word_freq_sorted))
    word_idx = dict(list(zip(words, six.moves.range(len(words)))))
    return word_idx


def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir,
                 output_test_dir):
    files = os.listdir(train_dir)
    if not os.path.exists(output_train_dir):
        os.mkdir(output_train_dir)
    for fi in files:
        with open(train_dir + '/' + fi, "r") as f:
            with open(output_train_dir + '/' + fi, "w") as wf:
                data_file = csv.reader(f)
                for row in data_file:
                    tag_raw = re.split(r'\W+', row[0].strip())
                    pos_index = tag_idx.get(tag_raw[0])
                    wf.write(str(pos_index) + ",")
                    text_raw = re.split(r'\W+', row[2].strip())
                    l = [text_idx.get(w) for w in text_raw]
                    for w in l:
                        wf.write(str(w) + " ")
                    wf.write("\n")
    files = os.listdir(test_dir)
    if not os.path.exists(output_test_dir):
        os.mkdir(output_test_dir)
    for fi in files:
        with open(test_dir + '/' + fi, "r") as f:
            with open(output_test_dir + '/' + fi, "w") as wf:
                data_file = csv.reader(f)
                for row in data_file:
                    tag_raw = re.split(r'\W+', row[0].strip())
                    pos_index = tag_idx.get(tag_raw[0])
                    wf.write(str(pos_index) + ",")
                    text_raw = re.split(r'\W+', row[2].strip())
                    l = [text_idx.get(w) for w in text_raw]
                    for w in l:
                        wf.write(str(w) + " ")
                    wf.write("\n")


def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
                output_vocab_text, output_vocab_tag):
    print("start constructing word dict")
    vocab_text = build_dict(2, 0, train_dir, test_dir)
    with open(output_vocab_text, "w") as wf:
        wf.write(str(len(vocab_text)) + "\n")
    vocab_tag = build_dict(0, 0, train_dir, test_dir)
    with open(output_vocab_tag, "w") as wf:
        wf.write(str(len(vocab_tag)) + "\n")
    print("construct word dict done\n")
    write_paddle(vocab_text, vocab_tag, train_dir, test_dir,
                 output_train_dir, output_test_dir)


train_dir = sys.argv[1]
test_dir = sys.argv[2]
output_train_dir = sys.argv[3]
output_test_dir = sys.argv[4]
output_vocab_text = sys.argv[5]
output_vocab_tag = sys.argv[6]
text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
            output_vocab_text, output_vocab_tag)
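Note that text2paddle.py writes only the vocabulary size (a single number) into the output vocab files, and utils.get_vocab_size later reads back exactly that first line. A quick sanity check (the size shown is hypothetical):

```
$ head -1 big_vocab_text.txt
62603
```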
@@ -7,90 +7,83 @@ import math
 import argparse
 import paddle
 import paddle.fluid as fluid
-import paddle.fluid.layers.nn as nn
-import paddle.fluid.layers.tensor as tensor
-import paddle.fluid.layers.control_flow as cf
-import paddle.fluid.layers.io as io
 import time
 import utils
+import net
 
 SEED = 102
 
 def parse_args():
     parser = argparse.ArgumentParser("TagSpace benchmark.")
-    parser.add_argument('train_file')
-    parser.add_argument('test_file')
-    parser.add_argument('--use_cuda', help='whether use gpu')
+    parser.add_argument(
+        '--neg_size', type=int, default=3, help='neg/pos ratio')
+    parser.add_argument(
+        '--train_dir', type=str, default='train_data', help='train file address')
+    parser.add_argument(
+        '--vocab_text_path', type=str, default='vocab_text.txt', help='vocab_text file address')
+    parser.add_argument(
+        '--vocab_tag_path', type=str, default='vocab_tag.txt', help='vocab_tag file address')
+    parser.add_argument(
+        '--model_dir', type=str, default='model_', help='model dir')
+    parser.add_argument(
+        '--batch_size', type=int, default=5, help='batch size')
+    parser.add_argument(
+        '--print_batch', type=int, default=10, help='print progress every N batches')
+    parser.add_argument(
+        '--pass_num', type=int, default=10, help='number of epochs')
+    parser.add_argument(
+        '--use_cuda', type=int, default=0, help='whether to use gpu')
+    parser.add_argument(
+        '--parallel', type=int, default=0, help='whether to train with multiple cards')
+    parser.add_argument(
+        '--base_lr', type=float, default=0.01, help='learning rate')
+    parser.add_argument(
+        '--num_devices', type=int, default=1, help='number of GPU devices')
     args = parser.parse_args()
     return args
 
-def network(vocab_text_size, vocab_tag_size, emb_dim=10, hid_dim=1000, win_size=5, margin=0.1, neg_size=5):
-    """ network definition """
-    text = io.data(name="text", shape=[1], lod_level=1, dtype='int64')
-    pos_tag = io.data(name="pos_tag", shape=[1], lod_level=1, dtype='int64')
-    neg_tag = io.data(name="neg_tag", shape=[1], lod_level=1, dtype='int64')
-    text_emb = nn.embedding(
-        input=text, size=[vocab_text_size, emb_dim], param_attr="text_emb")
-    pos_tag_emb = nn.embedding(
-        input=pos_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb")
-    neg_tag_emb = nn.embedding(
-        input=neg_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb")
-    conv_1d = fluid.nets.sequence_conv_pool(
-        input=text_emb,
-        num_filters=hid_dim,
-        filter_size=win_size,
-        act="tanh",
-        pool_type="max",
-        param_attr="cnn")
-    text_hid = fluid.layers.fc(input=conv_1d, size=emb_dim, param_attr="text_hid")
-    cos_pos = nn.cos_sim(pos_tag_emb, text_hid)
-    mul_text_hid = fluid.layers.sequence_expand_as(x=text_hid, y=neg_tag_emb)
-    mul_cos_neg = nn.cos_sim(neg_tag_emb, mul_text_hid)
-    cos_neg_all = fluid.layers.sequence_reshape(input=mul_cos_neg, new_dim=neg_size)
-    # choose the max negative cosine
-    cos_neg = nn.reduce_max(cos_neg_all, dim=1, keep_dim=True)
-    # calculate hinge loss
-    loss_part1 = nn.elementwise_sub(
-        tensor.fill_constant_batch_size_like(
-            input=cos_pos,
-            shape=[-1, 1],
-            value=margin,
-            dtype='float32'),
-        cos_pos)
-    loss_part2 = nn.elementwise_add(loss_part1, cos_neg)
-    loss_part3 = nn.elementwise_max(
-        tensor.fill_constant_batch_size_like(
-            input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
-        loss_part2)
-    avg_cost = nn.mean(loss_part3)
-    less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32')
-    correct = nn.reduce_sum(less)
-    return text, pos_tag, neg_tag, avg_cost, correct, cos_pos
+def get_cards(args):
+    return args.num_devices
 
-def train(train_reader, vocab_text, vocab_tag, base_lr, batch_size, neg_size,
-          pass_num, use_cuda, model_dir):
-    """ train network """
+def train():
+    """ do training """
+    args = parse_args()
-    vocab_text_size = len(vocab_text)
-    vocab_tag_size = len(vocab_tag)
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    train_dir = args.train_dir
+    vocab_text_path = args.vocab_text_path
+    vocab_tag_path = args.vocab_tag_path
+    use_cuda = True if args.use_cuda else False
+    parallel = True if args.parallel else False
+    batch_size = args.batch_size
+    neg_size = args.neg_size
+    print("use_cuda: {}, parallel: {}, batch_size: {}, neg_size: {} "
+          .format(use_cuda, parallel, batch_size, neg_size))
+    vocab_text_size, vocab_tag_size, train_reader = utils.prepare_data(
+        file_dir=train_dir, vocab_text_path=vocab_text_path,
+        vocab_tag_path=vocab_tag_path, neg_size=neg_size,
+        batch_size=batch_size * get_cards(args),
+        buffer_size=batch_size * 100, is_train=True)
     """ train network """
     # Train program
-    text, pos_tag, neg_tag, avg_cost, correct, cos_pos = network(vocab_text_size, vocab_tag_size, neg_size=neg_size)
+    text, pos_tag, neg_tag, avg_cost, correct, cos_pos = net.network(vocab_text_size, vocab_tag_size, neg_size=neg_size)
     # Optimization to minimize loss
-    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=base_lr)
+    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr)
     sgd_optimizer.minimize(avg_cost)
     # Initialize executor
-    startup_program = fluid.default_startup_program()
-    loop_program = fluid.default_main_program()
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = fluid.Executor(place)
-    exe.run(startup_program)
+    if parallel:
+        train_exe = fluid.ParallelExecutor(
+            use_cuda=use_cuda,
+            loss_name=avg_cost.name)
+    else:
+        train_exe = exe
+    pass_num = args.pass_num
+    model_dir = args.model_dir
+    fetch_list = [avg_cost.name]
+    exe.run(fluid.default_startup_program())
     total_time = 0.0
     for pass_idx in range(pass_num):
         epoch_idx = pass_idx + 1
@@ -100,17 +93,16 @@ def train(train_reader, vocab_text, vocab_tag, base_lr, batch_size, neg_size,
             lod_text_seq = utils.to_lodtensor([dat[0] for dat in data], place)
             lod_pos_tag = utils.to_lodtensor([dat[1] for dat in data], place)
             lod_neg_tag = utils.to_lodtensor([dat[2] for dat in data], place)
-            loss_val, correct_val = exe.run(
-                loop_program,
+            loss_val, correct_val = train_exe.run(
                 feed={
                     "text": lod_text_seq,
                    "pos_tag": lod_pos_tag,
                    "neg_tag": lod_neg_tag},
-                fetch_list=[avg_cost, correct])
-            if batch_id % 10 == 0:
-                print("TRAIN --> pass: {} batch_id: {} avg_cost: {}, acc: {}"
-                      .format(pass_idx, batch_id, loss_val,
-                              float(correct_val) / batch_size))
+                fetch_list=[avg_cost.name, correct.name])
+            if batch_id % args.print_batch == 0:
+                print("TRAIN --> pass: {} batch_num: {} avg_cost: {}, acc: {}"
+                      .format(pass_idx, (batch_id + 10) * batch_size, np.mean(loss_val),
+                              float(np.sum(correct_val)) / batch_size))
         t1 = time.time()
         total_time += t1 - t0
         print("epoch:%d num_steps:%d time_cost(s):%f" %
@@ -121,27 +113,5 @@ def train(train_reader, vocab_text, vocab_tag, base_lr, batch_size, neg_size,
     fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
     print("finish training")
 
-def train_net():
-    """ do training """
-    args = parse_args()
-    train_file = args.train_file
-    test_file = args.test_file
-    use_cuda = True if args.use_cuda else False
-    batch_size = 100
-    neg_size = 3
-    vocab_text, vocab_tag, train_reader, test_reader = utils.prepare_data(
-        train_file, test_file, neg_size=neg_size, batch_size=batch_size,
-        buffer_size=batch_size * 100, word_freq_threshold=0)
-    train(
-        train_reader=train_reader,
-        vocab_text=vocab_text,
-        vocab_tag=vocab_tag,
-        base_lr=0.01,
-        batch_size=batch_size,
-        neg_size=neg_size,
-        pass_num=10,
-        use_cuda=use_cuda,
-        model_dir="model")
-
 if __name__ == "__main__":
-    train_net()
+    train()
(A collapsed diff is omitted here.)
 import re
 import sys
 import collections
+import os
 import six
 import time
 import numpy as np
@@ -23,30 +24,39 @@ def to_lodtensor(data, place):
     res.set_lod([lod])
     return res
 
-def prepare_data(train_filename,
-                 test_filename,
-                 batch_size,
-                 neg_size=1,
-                 buffer_size=1000,
-                 word_freq_threshold=0,
-                 enable_ce=False):
+def get_vocab_size(vocab_path):
+    with open(vocab_path, "r") as rf:
+        line = rf.readline()
+        return int(line.strip())
+
+def prepare_data(file_dir,
+                 vocab_text_path,
+                 vocab_tag_path,
+                 batch_size,
+                 neg_size,
+                 buffer_size,
+                 is_train=True):
     """ prepare the AG's News Topic Classification data """
-    print("start construct word dict")
-    vocab_text = build_dict(2, word_freq_threshold, train_filename, test_filename)
-    vocab_tag = build_dict(0, word_freq_threshold, train_filename, test_filename)
-    print("construct word dict done\n")
-    train_reader = sort_batch(
-        paddle.reader.shuffle(
-            train(
-                train_filename, vocab_text, vocab_tag, neg_size,
-                buffer_size, data_type=DataType.SEQ),
-            buf_size=buffer_size),
-        batch_size, batch_size * 20)
-    test_reader = sort_batch(
-        test(
-            test_filename, vocab_text, vocab_tag, neg_size, buffer_size, data_type=DataType.SEQ),
-        batch_size, batch_size * 20)
-    return vocab_text, vocab_tag, train_reader, test_reader
+    print("start read file")
+    if is_train:
+        vocab_text_size = get_vocab_size(vocab_text_path)
+        vocab_tag_size = get_vocab_size(vocab_tag_path)
+        reader = sort_batch(
+            paddle.reader.shuffle(
+                train(
+                    file_dir, vocab_tag_size, neg_size,
+                    buffer_size, data_type=DataType.SEQ),
+                buf_size=buffer_size),
+            batch_size, batch_size * 20)
+    else:
+        vocab_tag_size = get_vocab_size(vocab_tag_path)
+        vocab_text_size = 0
+        reader = sort_batch(
+            test(
+                file_dir, vocab_tag_size, buffer_size, data_type=DataType.SEQ),
+            batch_size, batch_size * 20)
+    return vocab_text_size, vocab_tag_size, reader
 
 def sort_batch(reader, batch_size, sort_group_size, drop_last=False):
     """
@@ -97,82 +107,57 @@ def sort_batch(reader, batch_size, sort_group_size, drop_last=False):
 class DataType(object):
     SEQ = 2
 
-def word_count(column_num, input_file, word_freq=None):
-    """
-    compute word count from corpus
-    """
-    if word_freq is None:
-        word_freq = collections.defaultdict(int)
-    data_file = csv.reader(input_file)
-    for row in data_file:
-        for w in re.split(r'\W+', row[column_num].strip()):
-            word_freq[w] += 1
-    return word_freq
-
-def build_dict(column_num=2, min_word_freq=50, train_filename="", test_filename=""):
-    """
-    Build a word dictionary from the corpus. Keys of the dictionary are words,
-    and values are zero-based IDs of these words.
-    """
-    with open(train_filename) as trainf:
-        with open(test_filename) as testf:
-            word_freq = word_count(column_num, testf, word_count(column_num, trainf))
-    word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq]
-    word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
-    words, _ = list(zip(*word_freq_sorted))
-    word_idx = dict(list(zip(words, six.moves.range(len(words)))))
-    return word_idx
-
-def train_reader_creator(filename, text_idx, tag_idx, neg_size, n, data_type):
+def train_reader_creator(file_dir, tag_size, neg_size, n, data_type):
     def reader():
-        with open(filename) as input_file:
-            data_file = csv.reader(input_file)
-            for row in data_file:
-                text_raw = re.split(r'\W+', row[2].strip())
-                text = [text_idx.get(w) for w in text_raw]
-                tag_raw = re.split(r'\W+', row[0].strip())
-                pos_index = tag_idx.get(tag_raw[0])
-                pos_tag = []
-                pos_tag.append(pos_index)
-                neg_tag = []
-                max_iter = 100
-                now_iter = 0
-                sum_n = 0
-                while sum_n < neg_size:
-                    now_iter += 1
-                    if now_iter > max_iter:
-                        print("error : only one class")
-                        sys.exit(0)
-                    rand_i = np.random.randint(0, len(tag_idx))
-                    if rand_i != pos_index:
-                        neg_index = rand_i
-                        neg_tag.append(neg_index)
-                        sum_n += 1
-                if n > 0 and len(text) > n: continue
-                yield text, pos_tag, neg_tag
+        files = os.listdir(file_dir)
+        for fi in files:
+            with open(file_dir + '/' + fi, "r") as f:
+                for l in f:
+                    l = l.strip().split(",")
+                    pos_index = int(l[0])
+                    pos_tag = []
+                    pos_tag.append(pos_index)
+                    text_raw = l[1].split()
+                    text = [int(w) for w in text_raw]
+                    neg_tag = []
+                    max_iter = 100
+                    now_iter = 0
+                    sum_n = 0
+                    # rejection-sample neg_size tag ids that differ from the positive tag
+                    while sum_n < neg_size:
+                        now_iter += 1
+                        if now_iter > max_iter:
+                            print("error : only one class")
+                            sys.exit(0)
+                        rand_i = np.random.randint(0, tag_size)
+                        if rand_i != pos_index:
+                            neg_index = rand_i
+                            neg_tag.append(neg_index)
+                            sum_n += 1
+                    if n > 0 and len(text) > n: continue
+                    yield text, pos_tag, neg_tag
     return reader
 
-def test_reader_creator(filename, text_idx, tag_idx, n, data_type):
+def test_reader_creator(file_dir, tag_size, n, data_type):
     def reader():
-        with open(filename) as input_file:
-            data_file = csv.reader(input_file)
-            for row in data_file:
-                text_raw = re.split(r'\W+', row[2].strip())
-                text = [text_idx.get(w) for w in text_raw]
-                tag_raw = re.split(r'\W+', row[0].strip())
-                pos_index = tag_idx.get(tag_raw[0])
-                pos_tag = []
-                pos_tag.append(pos_index)
-                for ii in range(len(tag_idx)):
-                    tag = []
-                    tag.append(ii)
-                    yield text, tag, pos_tag
+        files = os.listdir(file_dir)
+        for fi in files:
+            with open(file_dir + '/' + fi, "r") as f:
+                for l in f:
+                    l = l.strip().split(",")
+                    pos_index = int(l[0])
+                    pos_tag = []
+                    pos_tag.append(pos_index)
+                    text_raw = l[1].split()
+                    text = [int(w) for w in text_raw]
+                    # score every tag id against the text; the positive tag is carried along
+                    for ii in range(tag_size):
+                        tag = []
+                        tag.append(ii)
+                        yield text, tag, pos_tag
     return reader
 
-def train(filename, text_idx, tag_idx, neg_size, n, data_type=DataType.SEQ):
-    return train_reader_creator(filename, text_idx, tag_idx, neg_size, n, data_type)
+def train(train_dir, tag_size, neg_size, n, data_type=DataType.SEQ):
+    return train_reader_creator(train_dir, tag_size, neg_size, n, data_type)
 
-def test(filename, text_idx, tag_idx, neg_size, n, data_type=DataType.SEQ):
-    return test_reader_creator(filename, text_idx, tag_idx, n, data_type)
+def test(test_dir, tag_size, n, data_type=DataType.SEQ):
+    return test_reader_creator(test_dir, tag_size, n, data_type)