Unverified commit 25370c17 authored by: Z zhang wenhui, committed by: GitHub

Merge pull request #1483 from frankwhzhang/multivie

add multiview-simnet infer
......@@ -75,76 +75,51 @@ def train():
exe = fluid.Executor(place)
if parallel:
train_exe = fluid.ParallelExecutor(
use_cuda=use_cuda,
loss_name=avg_cost.name)
else:
train_exe = exe
def train_loop(main_program):
""" train network """
pass_num = args.pass_num
model_dir = args.model_dir
fetch_list = [avg_cost.name]
exe.run(fluid.default_startup_program())
total_time = 0.0
for pass_idx in six.moves.xrange(pass_num):
epoch_idx = pass_idx + 1
print("epoch_%d start" % epoch_idx)
t0 = time.time()
i = 0
newest_ppl = 0
for data in train_reader():
i += 1
lod_src_wordseq = utils.to_lodtensor([dat[0] for dat in data],
place)
lod_dst_wordseq = utils.to_lodtensor([dat[1] for dat in data],
place)
ret_avg_cost = train_exe.run(feed={
"src_wordseq": lod_src_wordseq,
"dst_wordseq": lod_dst_wordseq},
fetch_list=fetch_list)
avg_ppl = np.exp(ret_avg_cost[0])
newest_ppl = np.mean(avg_ppl)
if i % args.print_batch == 0:
print("step:%d ppl:%.3f" % (i, newest_ppl))
t1 = time.time()
total_time += t1 - t0
print("epoch:%d num_steps:%d time_cost(s):%f" %
(epoch_idx, i, total_time / epoch_idx))
save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
feed_var_names = ["src_wordseq", "dst_wordseq"]
fetch_vars = [avg_cost, acc]
fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
print("model saved in %s" % save_dir)
#exe.close()
print("finish training")
if args.is_local:
print("run local training")
train_loop(fluid.default_main_program())
else:
print("run distribute training")
port = os.getenv("PADDLE_PORT", "6174")
pserver_ips = os.getenv("PADDLE_PSERVERS") # ip,ip...
eplist = []
for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
current_endpoint = os.getenv("POD_IP") + ":" + port
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
t = fluid.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
if training_role == "PSERVER":
pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint,
pserver_prog)
exe.run(pserver_startup)
exe.run(pserver_prog)
elif training_role == "TRAINER":
train_loop(t.get_trainer_program())
if __name__ == "__main__":
train()
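For reference, the distributed branch above reads its cluster layout from environment variables. A minimal sketch of how such an environment might look on one parameter-server node (the `PADDLE_*` and `POD_IP` names are taken from the code; `TRAINING_ROLE` and the `--is_local 0` flag are assumptions based on how `training_role` and `args.is_local` are used):

```bash
# hypothetical environment for one parameter-server process
export PADDLE_PORT=6174
export PADDLE_PSERVERS=192.168.1.10,192.168.1.11   # comma-separated pserver IPs
export PADDLE_TRAINERS_NUM=2
export POD_IP=192.168.1.10                         # IP of this node
export PADDLE_TRAINER_ID=0
export TRAINING_ROLE=PSERVER                       # assumed name; TRAINER on trainer nodes
python train.py --is_local 0
```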
......@@ -15,8 +15,13 @@
```bash
python train.py
```
## Infer
The command line options for inference can be listed by `python infer.py -h`
```bash
python infer.py
```
## Future work
- Multiple types of pairwise loss will be added to this project. For different views of the features between a user and an item, the matching relation can be jointly optimized with different loss functions. The model will be verified on real-world data.
- An inference tool will be added
- A Parallel Executor option will be added
- Distributed training will be added
......@@ -15,8 +15,13 @@ The command line options for training can be listed by `python train.py -h`
python train.py
```
## Infer
The command line options for inference can be listed by `python infer.py -h`
```bash
python infer.py
```
## Future work
- Multiple types of pairwise loss will be added to this project. For different views of the features between a user and an item, multiple losses will be supported. The model will be verified on a real-world dataset.
- An inference tool will be added
- A Parallel Executor option will be added to this project
- Distributed training will be added
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import time
import six
import numpy as np
import math
import argparse
import logging
import paddle.fluid as fluid
import paddle
import reader as reader
from nets import MultiviewSimnet, SimpleEncoderFactory
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
def parse_args():
parser = argparse.ArgumentParser("multi-view simnet")
parser.add_argument(
"--train_file", type=str, help="Training file")
parser.add_argument(
"--valid_file", type=str, help="Validation file")
parser.add_argument(
"--epochs", type=int, default=10, help="Number of epochs for training")
parser.add_argument(
"--model_dir", type=str, default='model_output', help="Model output folder")
parser.add_argument(
"--query_slots", type=int, default=1, help="Number of query slots")
parser.add_argument(
"--title_slots", type=int, default=1, help="Number of title slots")
parser.add_argument(
"--query_encoder", type=str, default="bow", help="Encoder module for slot encoding")
parser.add_argument(
"--title_encoder", type=str, default="bow", help="Encoder module for slot encoding")
parser.add_argument(
"--query_encode_dim", type=int, default=128, help="Dimension of query encoder output")
parser.add_argument(
"--title_encode_dim", type=int, default=128, help="Dimension of title encoder output")
parser.add_argument(
"--batch_size", type=int, default=128, help="Batch size for training")
parser.add_argument(
"--embedding_dim", type=int, default=128, help="Default Dimension of Embedding")
parser.add_argument(
"--sparse_feature_dim", type=int, default=1000001, help="Sparse feature hashing space for index processing")
parser.add_argument(
"--hidden_size", type=int, default=128, help="Hidden dim")
return parser.parse_args()
def start_infer(args, model_path):
dataset = reader.SyntheticDataset(args.sparse_feature_dim, args.query_slots,
args.title_slots)
test_reader = paddle.batch(
paddle.reader.shuffle(
dataset.valid(), buf_size=args.batch_size * 100),
batch_size=args.batch_size)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
with fluid.scope_guard(fluid.core.Scope()):
infer_program, feed_target_names, fetch_vars = fluid.io.load_inference_model(
args.model_dir, exe)
t0 = time.time()
step_id = 0
feeder = fluid.DataFeeder(program=infer_program, feed_list=feed_target_names, place=place)
for batch_id, data in enumerate(test_reader()):
step_id += 1
loss_val, correct_val = exe.run(infer_program,
feed=feeder.feed(data),
fetch_list=fetch_vars)
logger.info("TRAIN --> pass: {} batch_id: {} avg_cost: {}, acc: {}"
.format(step_id, batch_id, loss_val,
float(correct_val) / args.batch_size))
def main():
args = parse_args()
start_infer(args, args.model_dir)
if __name__ == "__main__":
main()
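A hedged usage example for this script, assuming a model has already been exported by train.py into the default output folder (the directory name comes from the `--model_dir` default above; adjust the path to wherever your inference model was saved):

```bash
python infer.py --model_dir model_output --batch_size 128
```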
......@@ -7,17 +7,27 @@
├── README.md            # documentation
├── train.py             # training script
├── infer.py             # inference script
├── net.py               # network structure
├── text2paddle.py       # converts raw text data into the Paddle format
├── cluster_train.py     # multi-machine training
├── cluster_train.sh     # multi-machine training launcher
├── utils                # common utilities
├── vocab_text.txt       # text vocabulary for the small sample
├── vocab_tag.txt        # tag vocabulary for the small sample
├── train_data           # training directory for the small sample
└── test_data            # test directory for the small sample
```
## Introduction
The TagSpace model is described in the paper [#TagSpace: Semantic Embeddings from Hashtags](https://research.fb.com/publications/tagspace-semantic-embeddings-from-hashtags/).
TagSpace learns embedding representations of text and tags. It targets industrial-scale tag recommendation; a typical application is recommending tags for feed news.
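As implemented in net.py below, the training objective is a max-margin ranking loss over cosine similarities. A sketch of the formula (notation is ours: $m$ is the margin, default 0.1, $\mathbf{t}$ the convolutional text representation, $\mathbf{u}^{+}$ the positive tag embedding and $\mathbf{u}^{-}_{j}$ the sampled negative tag embeddings):

$$
L = \max\Big(0,\; m - \cos(\mathbf{t}, \mathbf{u}^{+}) + \max_{j}\cos(\mathbf{t}, \mathbf{u}^{-}_{j})\Big)
$$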
## Download and preprocess the data
[ag news dataset](https://github.com/mhjabreel/CharCNN/tree/master/data/ag_news_csv)
......@@ -27,23 +37,54 @@ TagSpace模型的介绍可以参阅论文[#TagSpace: Semantic Embeddings from Ha
"3","Wall St. Bears Claw Back Into the Black (Reuters)","Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again."
```
To convert the text data into the Paddle format, first move the raw data into the training and test data directories:
```
mv train.csv raw_big_train_data
mv test.csv raw_big_test_data
```
Run text2paddle.py to generate the Paddle input format:
```
python text2paddle.py raw_big_train_data/ raw_big_test_data/ train_big_data test_big_data big_vocab_text.txt big_vocab_tag.txt
```
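After conversion, each line in the generated train/test directories stores the tag id, a comma, and the space-separated word ids of the text. A made-up example line, following the layout that text2paddle.py writes:

```
2,35 102 4 88 9 317
```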
## Single-machine training
'--use_cuda 1' trains on GPU and 0 on CPU; '--parallel 1' enables multi-card training.
Training on the small sample data (the sample data is already prepared, so the preprocessing above can be skipped and the commands run directly).
GPU environment
Run the following command to start training:
```
CUDA_VISIBLE_DEVICES=0 python train.py --use_cuda 1
```
CPU environment
Run the following command to start training:
```
python train.py
```
Single-machine, single-GPU training on the full dataset
```
CUDA_VISIBLE_DEVICES=0 python train.py --use_cuda 1 --train_dir train_big_data/ --vocab_text_path big_vocab_text.txt --vocab_tag_path big_vocab_tag.txt --model_dir big_model --batch_size 500
```
Single-machine, multi-GPU training on the full dataset
```
python train.py --train_dir train_big_data/ --vocab_text_path big_vocab_text.txt --vocab_tag_path big_vocab_tag.txt --model_dir big_model --batch_size 500 --parallel 1
```
## Infer
Inference on the small sample data
```
python infer.py
```
Inference on the full dataset
```
python infer.py --model_dir big_model --vocab_tag_path big_vocab_tag.txt --test_dir test_big_data/
```
## Simulate multi-machine training locally
Run the script:
```
sh cluster_train.sh
```
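The script launches two pservers and two trainers in the background and redirects their output to log files (names taken from cluster_train.sh), so progress can be followed with, for example:

```
tail -f pserver0.log pserver1.log trainer0.log trainer1.log
```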
import os
import sys
import time
import six
import numpy as np
import math
import argparse
import paddle
import paddle.fluid as fluid
import utils
import net
SEED = 102
def parse_args():
parser = argparse.ArgumentParser("TagSpace benchmark.")
parser.add_argument(
'--neg_size', type=int, default=3, help='neg/pos ratio')
parser.add_argument(
'--train_dir', type=str, default='train_data', help='train file address')
parser.add_argument(
'--vocab_text_path', type=str, default='vocab_text.txt', help='vocab_text file address')
parser.add_argument(
'--vocab_tag_path', type=str, default='vocab_tag.txt', help='vocab_tag file address')
parser.add_argument(
'--is_local', type=int, default=1, help='whether local')
parser.add_argument(
'--model_dir', type=str, default='model_', help='model dir')
parser.add_argument(
'--batch_size', type=int, default=5, help='num of batch size')
parser.add_argument(
'--print_batch', type=int, default=10, help='num of print batch')
parser.add_argument(
'--pass_num', type=int, default=10, help='num of epoch')
parser.add_argument(
'--use_cuda', type=int, default=0, help='whether use gpu')
parser.add_argument(
'--base_lr', type=float, default=0.01, help='learning rate')
parser.add_argument(
'--num_devices', type=int, default=1, help='Number of GPU devices')
parser.add_argument(
'--role', type=str, default='pserver', help='trainer or pserver')
parser.add_argument(
'--endpoints', type=str, default='127.0.0.1:6000', help='The pserver endpoints, like: 127.0.0.1:6000, 127.0.0.1:6001')
parser.add_argument(
'--current_endpoint', type=str, default='127.0.0.1:6000', help='The current_endpoint')
parser.add_argument(
'--trainer_id', type=int, default=0, help='trainer id; only trainer_id=0 saves the model')
parser.add_argument(
'--trainers', type=int, default=1, help='The number of trainers (default: 1)')
args = parser.parse_args()
return args
def get_cards(args):
return args.num_devices
def train():
""" do training """
args = parse_args()
train_dir = args.train_dir
vocab_text_path = args.vocab_text_path
vocab_tag_path = args.vocab_tag_path
use_cuda = True if args.use_cuda else False
batch_size = args.batch_size
neg_size = args.neg_size
vocab_text_size, vocab_tag_size, train_reader = utils.prepare_data(
file_dir=train_dir, vocab_text_path=vocab_text_path,
vocab_tag_path=vocab_tag_path, neg_size=neg_size,
batch_size=batch_size * get_cards(args),
buffer_size=batch_size*100, is_train=True)
""" train network """
# Train program
text, pos_tag, neg_tag, avg_cost, correct, cos_pos = net.network(vocab_text_size, vocab_tag_size, neg_size=neg_size)
# Optimization to minimize loss
sgd_optimizer = fluid.optimizer.SGD(learning_rate=args.base_lr)
sgd_optimizer.minimize(avg_cost)
def train_loop(main_program):
# Initialize executor
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
pass_num = args.pass_num
model_dir = args.model_dir
fetch_list = [avg_cost.name]
exe.run(fluid.default_startup_program())
total_time = 0.0
for pass_idx in range(pass_num):
epoch_idx = pass_idx + 1
print("epoch_%d start" % epoch_idx)
t0 = time.time()
for batch_id, data in enumerate(train_reader()):
lod_text_seq = utils.to_lodtensor([dat[0] for dat in data], place)
lod_pos_tag = utils.to_lodtensor([dat[1] for dat in data], place)
lod_neg_tag = utils.to_lodtensor([dat[2] for dat in data], place)
loss_val, correct_val = exe.run(
feed={
"text": lod_text_seq,
"pos_tag": lod_pos_tag,
"neg_tag": lod_neg_tag},
fetch_list=[avg_cost.name, correct.name])
if batch_id % args.print_batch == 0:
print("TRAIN --> pass: {} batch_num: {} avg_cost: {}, acc: {}"
.format(pass_idx, (batch_id+10) * batch_size, np.mean(loss_val),
float(np.sum(correct_val)) / batch_size))
t1 = time.time()
total_time += t1 - t0
print("epoch:%d num_steps:%d time_cost(s):%f" %
(epoch_idx, batch_id, total_time / epoch_idx))
save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
feed_var_names = ["text", "pos_tag"]
fetch_vars = [cos_pos]
fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
print("finish training")
if args.is_local:
print("run local training")
train_loop(fluid.default_main_program())
else:
print("run distribute training")
t = fluid.DistributeTranspiler()
t.transpile(args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
if args.role == "pserver":
print("run psever")
pserver_prog = t.get_pserver_program(args.current_endpoint)
pserver_startup = t.get_startup_program(args.current_endpoint,
pserver_prog)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(pserver_startup)
exe.run(pserver_prog)
elif args.role == "trainer":
print("run trainer")
train_loop(t.get_trainer_program())
if __name__ == "__main__":
train()
#!/bin/bash
#export GLOG_v=30
#export GLOG_logtostderr=1
# start pserver0
python cluster_train.py \
--train_dir train_data \
--model_dir cluster_model \
--batch_size 5 \
--is_local 0 \
--role pserver \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--current_endpoint 127.0.0.1:6000 \
--trainers 2 \
> pserver0.log 2>&1 &
# start pserver1
python cluster_train.py \
--train_dir train_data \
--model_dir cluster_model \
--batch_size 5 \
--is_local 0 \
--role pserver \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--current_endpoint 127.0.0.1:6001 \
--trainers 2 \
> pserver1.log 2>&1 &
# start trainer0
#CUDA_VISIBLE_DEVICES=1 python cluster_train.py \
python cluster_train.py \
--train_dir train_data \
--model_dir cluster_model \
--batch_size 5 \
--print_batch 10 \
--use_cuda 0 \
--is_local 0 \
--role trainer \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--trainers 2 \
--trainer_id 0 \
> trainer0.log 2>&1 &
# start trainer1
#CUDA_VISIBLE_DEVICES=2 python cluster_train.py \
python cluster_train.py \
--train_dir train_data \
--model_dir cluster_model \
--batch_size 5 \
--print_batch 10 \
--use_cuda 0 \
--is_local 0 \
--role trainer \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--trainers 2 \
--trainer_id 1 \
> trainer1.log 2>&1 &
import sys
import argparse
import time
import math
import unittest
......@@ -9,6 +10,25 @@ import paddle.fluid as fluid
import paddle
import utils
def parse_args():
parser = argparse.ArgumentParser("gru4rec benchmark.")
parser.add_argument(
'--test_dir', type=str, default='test_data', help='test file address')
parser.add_argument(
'--vocab_tag_path', type=str, default='vocab_tag.txt', help='vocab path')
parser.add_argument(
'--start_index', type=int, default=1, help='start index')
parser.add_argument(
'--last_index', type=int, default=10, help='end index')
parser.add_argument(
'--model_dir', type=str, default='model_', help='model dir')
parser.add_argument(
'--use_cuda', type=int, default=0, help='whether use cuda')
parser.add_argument(
'--batch_size', type=int, default=5, help='batch_size')
args = parser.parse_args()
return args
def infer(test_reader, vocab_tag, use_cuda, model_path):
""" inference function """
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
......@@ -21,7 +41,7 @@ def infer(test_reader, vocab_tag, use_cuda, model_path):
step_id = 0
true_num = 0
all_num = 0
size = vocab_tag
value = []
for data in test_reader():
step_id += 1
......@@ -47,30 +67,18 @@ def infer(test_reader, vocab_tag, use_cuda, model_path):
t1 = time.time()
if __name__ == "__main__":
args = parse_args()
start_index = args.start_index
last_index = args.last_index
test_dir = args.test_dir
model_dir = args.model_dir
batch_size = args.batch_size
vocab_tag_path = args.vocab_tag_path
use_cuda = True if args.use_cuda else False
print("start index: ", start_index, " last index: ", last_index)
vocab_text, vocab_tag, test_reader = utils.prepare_data(
test_dir, "", vocab_tag_path, batch_size=1,
neg_size=0, buffer_size=1000, is_train=False)
for epoch in range(start_index, last_index + 1):
epoch_path = model_dir + "/epoch_" + str(epoch)
......
import paddle.fluid as fluid
import paddle.fluid.layers.nn as nn
import paddle.fluid.layers.tensor as tensor
import paddle.fluid.layers.control_flow as cf
import paddle.fluid.layers.io as io
def network(vocab_text_size, vocab_tag_size, emb_dim=10, hid_dim=1000, win_size=5, margin=0.1, neg_size=5):
""" network definition """
text = io.data(name="text", shape=[1], lod_level=1, dtype='int64')
pos_tag = io.data(name="pos_tag", shape=[1], lod_level=1, dtype='int64')
neg_tag = io.data(name="neg_tag", shape=[1], lod_level=1, dtype='int64')
text_emb = nn.embedding(
input=text, size=[vocab_text_size, emb_dim], param_attr="text_emb")
pos_tag_emb = nn.embedding(
input=pos_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb")
neg_tag_emb = nn.embedding(
input=neg_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb")
conv_1d = fluid.nets.sequence_conv_pool(
input=text_emb,
num_filters=hid_dim,
filter_size=win_size,
act="tanh",
pool_type="max",
param_attr="cnn")
text_hid = fluid.layers.fc(input=conv_1d, size=emb_dim, param_attr="text_hid")
cos_pos = nn.cos_sim(pos_tag_emb, text_hid)
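# text_hid holds one vector per example while neg_tag_emb holds neg_size
# entries per example; expanding text_hid to neg_tag_emb's LoD lets cos_sim
# score the text against every sampled negative tag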
mul_text_hid = fluid.layers.sequence_expand_as(x=text_hid, y=neg_tag_emb)
mul_cos_neg = nn.cos_sim(neg_tag_emb, mul_text_hid)
cos_neg_all = fluid.layers.sequence_reshape(input=mul_cos_neg, new_dim=neg_size)
# choose the max negative cosine
cos_neg = nn.reduce_max(cos_neg_all, dim=1, keep_dim=True)
# calculate the hinge loss
loss_part1 = nn.elementwise_sub(
tensor.fill_constant_batch_size_like(
input=cos_pos,
shape=[-1, 1],
value=margin,
dtype='float32'),
cos_pos)
loss_part2 = nn.elementwise_add(loss_part1, cos_neg)
loss_part3 = nn.elementwise_max(
tensor.fill_constant_batch_size_like(
input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
loss_part2)
avg_cost = nn.mean(loss_part3)
less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32')
correct = nn.reduce_sum(less)
return text, pos_tag, neg_tag, avg_cost, correct, cos_pos
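A minimal sketch of how this module is consumed, mirroring the call in train.py and cluster_train.py; the vocabulary sizes below are illustrative assumptions only:

```python
import paddle.fluid as fluid
import net

# toy sizes for illustration; real values come from utils.prepare_data
vocab_text_size, vocab_tag_size, neg_size = 1000, 4, 3
text, pos_tag, neg_tag, avg_cost, correct, cos_pos = net.network(
    vocab_text_size, vocab_tag_size, neg_size=neg_size)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
sgd_optimizer.minimize(avg_cost)
```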
(Source diff too large to display; view the blob instead.)
(Source diff too large to display; view the blob instead.)
(This diff has been collapsed.)
import sys
import six
import collections
import os
import csv
import re
def word_count(column_num, input_file, word_freq=None):
"""
compute word count from corpus
"""
if word_freq is None:
word_freq = collections.defaultdict(int)
data_file = csv.reader(input_file)
for row in data_file:
for w in re.split(r'\W+',row[column_num].strip()):
word_freq[w]+= 1
return word_freq
def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""):
"""
Build a word dictionary from the corpus, Keys of the dictionary are words,
and values are zero-based IDs of these words.
"""
word_freq = collections.defaultdict(int)
files = os.listdir(train_dir)
for fi in files:
with open(train_dir + '/' + fi, "r") as f:
word_freq = word_count(column_num, f, word_freq)
files = os.listdir(test_dir)
for fi in files:
with open(test_dir + '/' + fi, "r") as f:
word_freq = word_count(column_num, f, word_freq)
word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq]
word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*word_freq_sorted))
word_idx = dict(list(zip(words, six.moves.range(len(words)))))
return word_idx
def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, output_test_dir):
files = os.listdir(train_dir)
if not os.path.exists(output_train_dir):
os.mkdir(output_train_dir)
for fi in files:
with open(train_dir + '/' + fi, "r") as f:
with open(output_train_dir + '/' + fi, "w") as wf:
data_file = csv.reader(f)
for row in data_file:
tag_raw = re.split(r'\W+', row[0].strip())
pos_index = tag_idx.get(tag_raw[0])
wf.write(str(pos_index) + ",")
text_raw = re.split(r'\W+', row[2].strip())
l = [text_idx.get(w) for w in text_raw]
for w in l:
wf.write(str(w) + " ")
wf.write("\n")
files = os.listdir(test_dir)
if not os.path.exists(output_test_dir):
os.mkdir(output_test_dir)
for fi in files:
with open(test_dir + '/' + fi, "r") as f:
with open(output_test_dir + '/' + fi, "w") as wf:
data_file = csv.reader(f)
for row in data_file:
tag_raw = re.split(r'\W+', row[0].strip())
pos_index = tag_idx.get(tag_raw[0])
wf.write(str(pos_index) + ",")
text_raw = re.split(r'\W+', row[2].strip())
l = [text_idx.get(w) for w in text_raw]
for w in l:
wf.write(str(w) + " ")
wf.write("\n")
def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab_text, output_vocab_tag):
print("start constuct word dict")
vocab_text = build_dict(2, 0, train_dir, test_dir)
with open(output_vocab_text, "w") as wf:
wf.write(str(len(vocab_text)) + "\n")
vocab_tag = build_dict(0, 0, train_dir, test_dir)
with open(output_vocab_tag, "w") as wf:
wf.write(str(len(vocab_tag)) + "\n")
print("construct word dict done\n")
write_paddle(vocab_text, vocab_tag, train_dir, test_dir, output_train_dir, output_test_dir)
train_dir = sys.argv[1]
test_dir = sys.argv[2]
output_train_dir = sys.argv[3]
output_test_dir = sys.argv[4]
output_vocab_text = sys.argv[5]
output_vocab_tag = sys.argv[6]
text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab_text, output_vocab_tag)
......@@ -7,90 +7,83 @@ import math
import argparse
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers.nn as nn
import paddle.fluid.layers.tensor as tensor
import paddle.fluid.layers.control_flow as cf
import paddle.fluid.layers.io as io
import time
import utils
import net
SEED = 102
def parse_args():
parser = argparse.ArgumentParser("TagSpace benchmark.")
parser.add_argument(
'--neg_size', type=int, default=3, help='neg/pos ratio')
parser.add_argument(
'--train_dir', type=str, default='train_data', help='train file address')
parser.add_argument(
'--vocab_text_path', type=str, default='vocab_text.txt', help='vocab_text file address')
parser.add_argument(
'--vocab_tag_path', type=str, default='vocab_tag.txt', help='vocab_tag file address')
parser.add_argument(
'--model_dir', type=str, default='model_', help='model dir')
parser.add_argument(
'--batch_size', type=int, default=5, help='num of batch size')
parser.add_argument(
'--print_batch', type=int, default=10, help='num of print batch')
parser.add_argument(
'--pass_num', type=int, default=10, help='num of epoch')
parser.add_argument(
'--use_cuda', type=int, default=0, help='whether use gpu')
parser.add_argument(
'--parallel', type=int, default=0, help='whether parallel')
parser.add_argument(
'--base_lr', type=float, default=0.01, help='learning rate')
parser.add_argument(
'--num_devices', type=int, default=1, help='Number of GPU devices')
args = parser.parse_args()
return args
def get_cards(args):
return args.num_devices
def train():
""" do training """
args = parse_args()
train_dir = args.train_dir
vocab_text_path = args.vocab_text_path
vocab_tag_path = args.vocab_tag_path
use_cuda = True if args.use_cuda else False
parallel = True if args.parallel else False
batch_size = args.batch_size
neg_size = args.neg_size
print("use_cuda: {}, parallel: {}, batch_size: {}, neg_size: {} "
.format(use_cuda, parallel, batch_size, neg_size))
vocab_text_size, vocab_tag_size, train_reader = utils.prepare_data(
file_dir=train_dir, vocab_text_path=vocab_text_path,
vocab_tag_path=vocab_tag_path, neg_size=neg_size,
batch_size=batch_size * get_cards(args),
buffer_size=batch_size*100, is_train=True)
""" train network """
# Train program
text, pos_tag, neg_tag, avg_cost, correct, cos_pos = net.network(vocab_text_size, vocab_tag_size, neg_size=neg_size)
# Optimization to minimize loss
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr)
sgd_optimizer.minimize(avg_cost)
# Initialize executor
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
if parallel:
train_exe = fluid.ParallelExecutor(
use_cuda=use_cuda,
loss_name=avg_cost.name)
else:
train_exe = exe
pass_num = args.pass_num
model_dir = args.model_dir
fetch_list = [avg_cost.name]
exe.run(fluid.default_startup_program())
total_time = 0.0
for pass_idx in range(pass_num):
epoch_idx = pass_idx + 1
......@@ -100,17 +93,16 @@ def train(train_reader, vocab_text, vocab_tag, base_lr, batch_size, neg_size,
lod_text_seq = utils.to_lodtensor([dat[0] for dat in data], place)
lod_pos_tag = utils.to_lodtensor([dat[1] for dat in data], place)
lod_neg_tag = utils.to_lodtensor([dat[2] for dat in data], place)
loss_val, correct_val = train_exe.run(
feed={
"text": lod_text_seq,
"pos_tag": lod_pos_tag,
"neg_tag": lod_neg_tag},
fetch_list=[avg_cost.name, correct.name])
if batch_id % args.print_batch == 0:
print("TRAIN --> pass: {} batch_num: {} avg_cost: {}, acc: {}"
.format(pass_idx, (batch_id+10) * batch_size, np.mean(loss_val),
float(np.sum(correct_val)) / batch_size))
t1 = time.time()
total_time += t1 - t0
print("epoch:%d num_steps:%d time_cost(s):%f" %
......@@ -121,27 +113,5 @@ def train(train_reader, vocab_text, vocab_tag, base_lr, batch_size, neg_size,
fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
print("finish training")
if __name__ == "__main__":
train()
(This diff has been collapsed.)
import re
import sys
import collections
import os
import six
import time
import numpy as np
......@@ -23,30 +24,39 @@ def to_lodtensor(data, place):
res.set_lod([lod])
return res
def get_vocab_size(vocab_path):
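# the vocab files written by text2paddle.py keep only the vocabulary size on
# their first line, which is all the readers below need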
with open(vocab_path, "r") as rf:
line = rf.readline()
return int(line.strip())
def prepare_data(file_dir,
vocab_text_path,
vocab_tag_path,
batch_size,
neg_size,
buffer_size,
is_train=True):
""" prepare the AG's News Topic Classification data """
print("start constuct word dict")
vocab_text = build_dict(2, word_freq_threshold, train_filename, test_filename)
vocab_tag = build_dict(0, word_freq_threshold, train_filename, test_filename)
print("construct word dict done\n")
train_reader = sort_batch(
paddle.reader.shuffle(
train(
train_filename, vocab_text, vocab_tag, neg_size,
buffer_size, data_type=DataType.SEQ),
buf_size=buffer_size),
batch_size, batch_size * 20)
test_reader = sort_batch(
test(
test_filename, vocab_text, vocab_tag, neg_size, buffer_size, data_type=DataType.SEQ),
batch_size, batch_size * 20)
return vocab_text, vocab_tag, train_reader, test_reader
print("start read file")
if is_train:
vocab_text_size = get_vocab_size(vocab_text_path)
vocab_tag_size = get_vocab_size(vocab_tag_path)
reader = sort_batch(
paddle.reader.shuffle(
train(
file_dir, vocab_tag_size, neg_size,
buffer_size, data_type=DataType.SEQ),
buf_size=buffer_size),
batch_size, batch_size * 20)
else:
vocab_tag_size = get_vocab_size(vocab_tag_path)
vocab_text_size = 0
reader = sort_batch(
test(
file_dir, vocab_tag_size, buffer_size, data_type=DataType.SEQ),
batch_size, batch_size * 20)
return vocab_text_size, vocab_tag_size, reader
def sort_batch(reader, batch_size, sort_group_size, drop_last=False):
"""
......@@ -97,82 +107,57 @@ def sort_batch(reader, batch_size, sort_group_size, drop_last=False):
class DataType(object):
SEQ = 2
def train_reader_creator(file_dir, tag_size, neg_size, n, data_type):
def reader():
files = os.listdir(file_dir)
for fi in files:
with open(file_dir + '/' + fi, "r") as f:
for l in f:
l = l.strip().split(",")
pos_index = int(l[0])
pos_tag = []
pos_tag.append(pos_index)
text_raw = l[1].split()
text = [int(w) for w in text_raw]
neg_tag = []
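# draw neg_size random tag ids that differ from the positive tag; abort
# after max_iter attempts (e.g. when the tag vocabulary has a single class)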
max_iter = 100
now_iter = 0
sum_n = 0
while(sum_n < neg_size) :
now_iter += 1
if now_iter > max_iter:
print("error : only one class")
sys.exit(0)
rand_i = np.random.randint(0, tag_size)
if rand_i != pos_index:
neg_index = rand_i
neg_tag.append(neg_index)
sum_n += 1
if n > 0 and len(text) > n: continue
yield text, pos_tag, neg_tag
return reader
def test_reader_creator(file_dir, tag_size, n, data_type):
def reader():
files = os.listdir(file_dir)
for fi in files:
with open(file_dir + '/' + fi, "r") as f:
for l in f:
l = l.strip().split(",")
pos_index = int(l[0])
pos_tag = []
pos_tag.append(pos_index)
text_raw = l[1].split()
text = [int(w) for w in text_raw]
for ii in range(tag_size):
tag = []
tag.append(ii)
yield text, tag, pos_tag
return reader
def train(train_dir, tag_size, neg_size, n, data_type=DataType.SEQ):
return train_reader_creator(train_dir, tag_size, neg_size, n, data_type)
def test(test_dir, tag_size, n, data_type=DataType.SEQ):
return test_reader_creator(test_dir, tag_size, n, data_type)