Unverified · Commit 25370c17 · authored by zhang wenhui · committed by GitHub

Merge pull request #1483 from frankwhzhang/multivie

add multiview-simnet infer
@@ -75,76 +75,51 @@ def train():
    exe = fluid.Executor(place)
    if parallel:
        train_exe = fluid.ParallelExecutor(
-            use_cuda=use_cuda, loss_name=avg_cost.name)
+            use_cuda=use_cuda,
+            loss_name=avg_cost.name)
    else:
        train_exe = exe
-    def train_loop(main_program):
-        """ train network """
    pass_num = args.pass_num
    model_dir = args.model_dir
    fetch_list = [avg_cost.name]
    exe.run(fluid.default_startup_program())
    total_time = 0.0
    for pass_idx in six.moves.xrange(pass_num):
        epoch_idx = pass_idx + 1
        print("epoch_%d start" % epoch_idx)
        t0 = time.time()
        i = 0
        newest_ppl = 0
        for data in train_reader():
            i += 1
            lod_src_wordseq = utils.to_lodtensor([dat[0] for dat in data],
                                                 place)
            lod_dst_wordseq = utils.to_lodtensor([dat[1] for dat in data],
                                                 place)
-            ret_avg_cost = train_exe.run(main_program,
-                                         feed={ "src_wordseq": lod_src_wordseq,
-                                                "dst_wordseq": lod_dst_wordseq},
-                                         fetch_list=fetch_list)
+            ret_avg_cost = train_exe.run(feed={
+                "src_wordseq": lod_src_wordseq,
+                "dst_wordseq": lod_dst_wordseq},
+                fetch_list=fetch_list)
            avg_ppl = np.exp(ret_avg_cost[0])
            newest_ppl = np.mean(avg_ppl)
            if i % args.print_batch == 0:
                print("step:%d ppl:%.3f" % (i, newest_ppl))
        t1 = time.time()
        total_time += t1 - t0
        print("epoch:%d num_steps:%d time_cost(s):%f" %
              (epoch_idx, i, total_time / epoch_idx))
        save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
        feed_var_names = ["src_wordseq", "dst_wordseq"]
        fetch_vars = [avg_cost, acc]
        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
        print("model saved in %s" % save_dir)
    #exe.close()
    print("finish training")
-    if args.is_local:
-        print("run local training")
-        train_loop(fluid.default_main_program())
-    else:
-        print("run distribute training")
-        port = os.getenv("PADDLE_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_PSERVERS")  # ip,ip...
-        eplist = []
-        for ip in pserver_ips.split(","):
-            eplist.append(':'.join([ip, port]))
-        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
-        current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
-        t = fluid.DistributeTranspiler()
-        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
-        if training_role == "PSERVER":
-            pserver_prog = t.get_pserver_program(current_endpoint)
-            pserver_startup = t.get_startup_program(current_endpoint,
-                                                    pserver_prog)
-            exe.run(pserver_startup)
-            exe.run(pserver_prog)
-        elif training_role == "TRAINER":
-            train_loop(t.get_trainer_program())
if __name__ == "__main__":
    train()
@@ -15,8 +15,13 @@
```bash
python train.py
```
+## Infer
+The command line options of the inference tool can be listed with `python infer.py -h`.
+```bash
+python infer.py
+```
## Future work
- Multiple pairwise loss functions will be added to this project. For features from different views, the user-item matching relation can be jointly optimized with different loss functions. The whole model will be verified on real-world data.
-- The inference tool will be added
- The Parallel Executor option will be added
- Distributed training will be added
@@ -15,8 +15,13 @@ The command line options for training can be listed by `python train.py -h`
python train.py
```
+## Infer
+The command line options for inference can be listed by `python infer.py -h`
+```bash
+python infer.py
+```
## Future work
- Multiple types of pairwise loss will be added in this project. For different views of features between a user and an item, multiple losses will be supported. The model will be verified in real world dataset.
-- infer will be added
- Parallel Executor will be added in this project
- Distributed Training will be added
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import time
import six
import numpy as np
import math
import argparse
import logging
import paddle.fluid as fluid
import paddle
import time
import reader as reader
from nets import MultiviewSimnet, SimpleEncoderFactory
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
def parse_args():
    parser = argparse.ArgumentParser("multi-view simnet")
    parser.add_argument("--train_file", type=str, help="Training file")
    parser.add_argument("--valid_file", type=str, help="Validation file")
    parser.add_argument(
        "--epochs", type=int, default=10, help="Number of epochs for training")
    parser.add_argument(
        "--model_dir", type=str, default='model_output', help="Model output folder")
    parser.add_argument(
        "--query_slots", type=int, default=1, help="Number of query slots")
    parser.add_argument(
        "--title_slots", type=int, default=1, help="Number of title slots")
    parser.add_argument(
        "--query_encoder", type=str, default="bow", help="Encoder module for slot encoding")
    parser.add_argument(
        "--title_encoder", type=str, default="bow", help="Encoder module for slot encoding")
    parser.add_argument(
        "--query_encode_dim", type=int, default=128, help="Dimension of query encoder output")
    parser.add_argument(
        "--title_encode_dim", type=int, default=128, help="Dimension of title encoder output")
    parser.add_argument(
        "--batch_size", type=int, default=128, help="Batch size for training")
    parser.add_argument(
        "--embedding_dim", type=int, default=128, help="Default Dimension of Embedding")
    parser.add_argument(
        "--sparse_feature_dim", type=int, default=1000001,
        help="Sparse feature hashing space for index processing")
    parser.add_argument(
        "--hidden_size", type=int, default=128, help="Hidden dim")
    return parser.parse_args()

def start_infer(args, model_path):
    # Build a synthetic validation set with the same schema as training.
    dataset = reader.SyntheticDataset(args.sparse_feature_dim, args.query_slots,
                                      args.title_slots)
    test_reader = paddle.batch(
        paddle.reader.shuffle(
            dataset.valid(), buf_size=args.batch_size * 100),
        batch_size=args.batch_size)
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    with fluid.scope_guard(fluid.core.Scope()):
        # Load the inference program saved by train.py.
        infer_program, feed_target_names, fetch_vars = fluid.io.load_inference_model(
            args.model_dir, exe)
        t0 = time.time()
        step_id = 0
        feeder = fluid.DataFeeder(
            program=infer_program, feed_list=feed_target_names, place=place)
        for batch_id, data in enumerate(test_reader()):
            step_id += 1
            loss_val, correct_val = exe.run(infer_program,
                                            feed=feeder.feed(data),
                                            fetch_list=fetch_vars)
            logger.info("TRAIN --> pass: {} batch_id: {} avg_cost: {}, acc: {}"
                        .format(step_id, batch_id, loss_val,
                                float(correct_val) / args.batch_size))

def main():
    args = parse_args()
    start_infer(args, args.model_dir)

if __name__ == "__main__":
    main()
@@ -7,17 +7,27 @@
├── README.md          # documentation
├── train.py           # training script
├── infer.py           # inference script
+├── net.py             # network structure
+├── text2paddle.py     # converts raw text data to the Paddle format
+├── cluster_train.py   # multi-machine training
+├── cluster_train.sh   # multi-machine training launch script
├── utils              # common utilities
-├── small_train.txt    # small-sample training set
+├── vocab_text.txt     # small-sample text vocabulary
-└── small_test.txt     # small-sample test set
+├── vocab_tag.txt      # small-sample tag vocabulary
+├── train_data         # small-sample training directory
+└── test_data          # small-sample test directory
```
## Introduction
-The TagSpace model is introduced in the paper [#TagSpace: Semantic Embeddings from Hashtags](https://research.fb.com/publications/tagspace-semantic-embeddings-from-hashtags/); in this example we implement the TagSpace model.
+The TagSpace model is introduced in the paper [#TagSpace: Semantic Embeddings from Hashtags](https://research.fb.com/publications/tagspace-semantic-embeddings-from-hashtags/)
-## Data download
+TagSpace learns embedding representations of text and tags for industrial-scale tag recommendation; a typical application is recommending tags for feed news.
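For reference, the ranking objective implemented in net.py (added later in this commit) is a max-margin hinge loss over cosine similarities between the convolutional text encoding and the tag embeddings; written out in our own notation, it is roughly:
```
L = \frac{1}{N}\sum_{i=1}^{N}\max\Big(0,\; m-\cos\big(f(w_i),\,t_i^{+}\big)+\max_{t^{-}\in\mathcal{N}_i}\cos\big(f(w_i),\,t^{-}\big)\Big)
```
where `f(w_i)` is the CNN encoding of the i-th text, `t_i^+` the embedding of its positive tag, `N_i` the set of sampled negative tags, and `m` the margin (0.1 by default in net.py).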
+## Data download and preprocessing
[ag news dataset](https://github.com/mhjabreel/CharCNN/tree/master/data/ag_news_csv)
@@ -27,23 +37,54 @@ The TagSpace model is introduced in the paper [#TagSpace: Semantic Embeddings from Hashtags]
"3","Wall St. Bears Claw Back Into the Black (Reuters)","Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again."
```
-## Training
+Convert the text data into the Paddle input format. First move the raw data into the training and test data directories:
-'--use_cuda 1' means training on GPU; by default the CPU is used
+```
+mv train.csv raw_big_train_data
+mv test.csv raw_big_test_data
+```
+Run the script text2paddle.py to generate the Paddle input format:
+```
+python text2paddle.py raw_big_train_data/ raw_big_test_data/ train_big_data test_big_data big_vocab_text.txt big_vocab_tag.txt
+```
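Each converted sample is one line of the form `tag_id,word_id word_id ...`, matching what write_paddle in text2paddle.py emits and what the readers in utils.py (both later in this commit) parse back. A minimal sketch of that parsing, with made-up sample values:
```python
# Hypothetical converted line: positive tag id, then space-separated word ids of the text.
line = "3,25 4 180 16"
tag_part, text_part = line.strip().split(",")
pos_tag = [int(tag_part)]                    # positive tag id, e.g. [3]
text = [int(w) for w in text_part.split()]   # word ids, e.g. [25, 4, 180, 16]
print(pos_tag, text)
```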
+## Single-machine training
+'--use_cuda 1' means training on GPU and 0 means CPU; '--parallel 1' means multi-card training (a short sketch of what this flag does follows the commands below).
+Training on the small sample data (the sample data ships with this example, so you can skip the data preparation above and run the commands directly).
GPU environment
-Run `CUDA_VISIBLE_DEVICES=0 python train.py train_file test_file --use_cuda 1` to start training the model.
```
-CUDA_VISIBLE_DEVICES=0 python train.py small_train.txt small_test.txt --use_cuda 1
+CUDA_VISIBLE_DEVICES=0 python train.py --use_cuda 1
```
CPU environment
-Run `python train.py train_file test_file` to start training the model.
```
-python train.py small_train.txt small_test.txt
+python train.py
+```
+Full-data single-machine single-card training
+```
+CUDA_VISIBLE_DEVICES=0 python train.py --use_cuda 1 --train_dir train_big_data/ --vocab_text_path big_vocab_text.txt --vocab_tag_path big_vocab_tag.txt --model_dir big_model --batch_size 500
+```
+Full-data single-machine multi-card training
+```
+python train.py --train_dir train_big_data/ --vocab_text_path big_vocab_text.txt --vocab_tag_path big_vocab_tag.txt --model_dir big_model --batch_size 500 --parallel 1
```
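As a reference for the `--parallel` flag: train.py (shown in full later in this diff) simply wraps the executor in a `fluid.ParallelExecutor`. A minimal sketch, assuming `use_cuda`, `parallel`, and the network's `avg_cost` are already defined as in that script:
```python
import paddle.fluid as fluid

# Sketch of the --parallel branch in train.py; use_cuda, parallel and avg_cost
# are assumed to come from the surrounding training script.
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
if parallel:
    # Runs the same program on all visible devices, optimizing the given loss.
    train_exe = fluid.ParallelExecutor(use_cuda=use_cuda, loss_name=avg_cost.name)
else:
    train_exe = exe
```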
## Inference
+Inference on the small sample data
+```
+python infer.py
+```
+Inference on the full data
```
-CUDA_VISIBLE_DEVICES=0 python infer.py model/ 1 10 small_train.txt small_test.txt --use_cuda 1
+python infer.py --model_dir big_model --vocab_tag_path big_vocab_tag.txt --test_dir test_big_data/
```
+## Simulating multi-machine training locally
+Run the command
+```
+sh cluster_train.sh
+```
import os
import sys
import time
import six
import numpy as np
import math
import argparse
import paddle
import paddle.fluid as fluid
import time
import utils
import net
SEED = 102
def parse_args():
parser = argparse.ArgumentParser("TagSpace benchmark.")
parser.add_argument(
'--neg_size', type=int, default=3, help='neg/pos ratio')
parser.add_argument(
'--train_dir', type=str, default='train_data', help='train file address')
parser.add_argument(
'--vocab_text_path', type=str, default='vocab_text.txt', help='vocab_text file address')
parser.add_argument(
'--vocab_tag_path', type=str, default='vocab_tag.txt', help='vocab_text file address')
parser.add_argument(
'--is_local', type=int, default=1, help='whether local')
parser.add_argument(
'--model_dir', type=str, default='model_', help='model dir')
parser.add_argument(
'--batch_size', type=int, default=5, help='num of batch size')
parser.add_argument(
'--print_batch', type=int, default=10, help='num of print batch')
parser.add_argument(
'--pass_num', type=int, default=10, help='num of epoch')
parser.add_argument(
'--use_cuda', type=int, default=0, help='whether use gpu')
parser.add_argument(
'--base_lr', type=float, default=0.01, help='learning rate')
parser.add_argument(
'--num_devices', type=int, default=1, help='Number of GPU devices')
parser.add_argument(
'--role', type=str, default='pserver', help='trainer or pserver')
parser.add_argument(
'--endpoints', type=str, default='127.0.0.1:6000', help='The pserver endpoints, like: 127.0.0.1:6000, 127.0.0.1:6001')
parser.add_argument(
'--current_endpoint', type=str, default='127.0.0.1:6000', help='The current_endpoint')
parser.add_argument(
'--trainer_id', type=int, default=0, help='trainer id ,only trainer_id=0 save model')
parser.add_argument(
'--trainers', type=int, default=1, help='The num of trainers (default: 1)')
args = parser.parse_args()
return args
def get_cards(args):
return args.num_devices
def train():
""" do training """
args = parse_args()
train_dir = args.train_dir
vocab_text_path = args.vocab_text_path
vocab_tag_path = args.vocab_tag_path
use_cuda = True if args.use_cuda else False
batch_size = args.batch_size
neg_size = args.neg_size
vocab_text_size, vocab_tag_size, train_reader = utils.prepare_data(
file_dir=train_dir, vocab_text_path=vocab_text_path,
vocab_tag_path=vocab_tag_path, neg_size=neg_size,
batch_size=batch_size * get_cards(args),
buffer_size=batch_size*100, is_train=True)
""" train network """
# Train program
text, pos_tag, neg_tag, avg_cost, correct, cos_pos = net.network(vocab_text_size, vocab_tag_size, neg_size=neg_size)
# Optimization to minimize lost
sgd_optimizer = fluid.optimizer.SGD(learning_rate=args.base_lr)
sgd_optimizer.minimize(avg_cost)
def train_loop(main_program):
# Initialize executor
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
pass_num = args.pass_num
model_dir = args.model_dir
fetch_list = [avg_cost.name]
exe.run(fluid.default_startup_program())
total_time = 0.0
for pass_idx in range(pass_num):
epoch_idx = pass_idx + 1
print("epoch_%d start" % epoch_idx)
t0 = time.time()
for batch_id, data in enumerate(train_reader()):
lod_text_seq = utils.to_lodtensor([dat[0] for dat in data], place)
lod_pos_tag = utils.to_lodtensor([dat[1] for dat in data], place)
lod_neg_tag = utils.to_lodtensor([dat[2] for dat in data], place)
loss_val, correct_val = exe.run(
feed={
"text": lod_text_seq,
"pos_tag": lod_pos_tag,
"neg_tag": lod_neg_tag},
fetch_list=[avg_cost.name, correct.name])
if batch_id % args.print_batch == 0:
print("TRAIN --> pass: {} batch_num: {} avg_cost: {}, acc: {}"
.format(pass_idx, (batch_id+10) * batch_size, np.mean(loss_val),
float(np.sum(correct_val)) / batch_size))
t1 = time.time()
total_time += t1 - t0
print("epoch:%d num_steps:%d time_cost(s):%f" %
(epoch_idx, batch_id, total_time / epoch_idx))
save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
feed_var_names = ["text", "pos_tag"]
fetch_vars = [cos_pos]
fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
print("finish training")
if args.is_local:
print("run local training")
train_loop(fluid.default_main_program())
else:
print("run distribute training")
t = fluid.DistributeTranspiler()
t.transpile(args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
if args.role == "pserver":
print("run psever")
pserver_prog = t.get_pserver_program(args.current_endpoint)
pserver_startup = t.get_startup_program(args.current_endpoint,
pserver_prog)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(pserver_startup)
exe.run(pserver_prog)
elif args.role == "trainer":
print("run trainer")
train_loop(t.get_trainer_program())
if __name__ == "__main__":
train()
#!/bin/bash
#export GLOG_v=30
#export GLOG_logtostderr=1
# start pserver0
python cluster_train.py \
--train_dir train_data \
--model_dir cluster_model \
--batch_size 5 \
--is_local 0 \
--role pserver \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--current_endpoint 127.0.0.1:6000 \
--trainers 2 \
> pserver0.log 2>&1 &
# start pserver1
python cluster_train.py \
--train_dir train_data \
--model_dir cluster_model \
--batch_size 5 \
--is_local 0 \
--role pserver \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--current_endpoint 127.0.0.1:6001 \
--trainers 2 \
> pserver1.log 2>&1 &
# start trainer0
#CUDA_VISIBLE_DEVICES=1 python cluster_train.py \
python cluster_train.py \
--train_dir train_data \
--model_dir cluster_model \
--batch_size 5 \
--print_batch 10 \
--use_cuda 0 \
--is_local 0 \
--role trainer \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--trainers 2 \
--trainer_id 0 \
> trainer0.log 2>&1 &
# start trainer1
#CUDA_VISIBLE_DEVICES=2 python cluster_train.py \
python cluster_train.py \
--train_dir train_data \
--model_dir cluster_model \
--batch_size 5 \
--print_batch 10 \
--use_cuda 0 \
--is_local 0 \
--role trainer \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--trainers 2 \
--trainer_id 1 \
> trainer1.log 2>&1 &
import sys
+import argparse
import time
import math
import unittest
@@ -9,6 +10,25 @@ import paddle.fluid as fluid
import paddle
import utils
+def parse_args():
+    parser = argparse.ArgumentParser("gru4rec benchmark.")
+    parser.add_argument(
+        '--test_dir', type=str, default='test_data', help='test file address')
+    parser.add_argument(
+        '--vocab_tag_path', type=str, default='vocab_tag.txt', help='vocab path')
+    parser.add_argument(
+        '--start_index', type=int, default='1', help='start index')
+    parser.add_argument(
+        '--last_index', type=int, default='10', help='end index')
+    parser.add_argument(
+        '--model_dir', type=str, default='model_', help='model dir')
+    parser.add_argument(
+        '--use_cuda', type=int, default='0', help='whether use cuda')
+    parser.add_argument(
+        '--batch_size', type=int, default='5', help='batch_size')
+    args = parser.parse_args()
+    return args
def infer(test_reader, vocab_tag, use_cuda, model_path):
    """ inference function """
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
@@ -21,7 +41,7 @@ def infer(test_reader, vocab_tag, use_cuda, model_path):
    step_id = 0
    true_num = 0
    all_num = 0
-    size = len(vocab_tag)
+    size = vocab_tag
    value = []
    for data in test_reader():
        step_id += 1
@@ -47,30 +67,18 @@ def infer(test_reader, vocab_tag, use_cuda, model_path):
    t1 = time.time()
if __name__ == "__main__":
-    if len(sys.argv) != 6:
-        print(
-            "Usage: %s model_dir start_epoch last_epoch(inclusive) train_file test_file"
-        )
-        exit(0)
-    train_file = ""
-    test_file = ""
-    model_dir = sys.argv[1]
-    try:
-        start_index = int(sys.argv[2])
-        last_index = int(sys.argv[3])
-        train_file = sys.argv[4]
-        test_file = sys.argv[5]
-    except:
-        print(
-            "Usage: %s model_dir start_ipoch last_epoch(inclusive) train_file test_file"
-        )
-        exit(-1)
-    vocab_text, vocab_tag, train_reader, test_reader = utils.prepare_data(
-        train_file,
-        test_file,
-        batch_size=1,
-        buffer_size=1000,
-        word_freq_threshold=0)
+    args = parse_args()
+    start_index = args.start_index
+    last_index = args.last_index
+    test_dir = args.test_dir
+    model_dir = args.model_dir
+    batch_size = args.batch_size
+    vocab_tag_path = args.vocab_tag_path
+    use_cuda = True if args.use_cuda else False
+    print("start index: ", start_index, " last_index:", last_index)
+    vocab_text, vocab_tag, test_reader = utils.prepare_data(
+        test_dir, "", vocab_tag_path, batch_size=1,
+        neg_size=0, buffer_size=1000, is_train=False)
    for epoch in xrange(start_index, last_index + 1):
        epoch_path = model_dir + "/epoch_" + str(epoch)
...
import paddle.fluid as fluid
import paddle.fluid.layers.nn as nn
import paddle.fluid.layers.tensor as tensor
import paddle.fluid.layers.control_flow as cf
import paddle.fluid.layers.io as io

def network(vocab_text_size, vocab_tag_size, emb_dim=10, hid_dim=1000,
            win_size=5, margin=0.1, neg_size=5):
    """ network definition """
    text = io.data(name="text", shape=[1], lod_level=1, dtype='int64')
    pos_tag = io.data(name="pos_tag", shape=[1], lod_level=1, dtype='int64')
    neg_tag = io.data(name="neg_tag", shape=[1], lod_level=1, dtype='int64')
    text_emb = nn.embedding(
        input=text, size=[vocab_text_size, emb_dim], param_attr="text_emb")
    pos_tag_emb = nn.embedding(
        input=pos_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb")
    neg_tag_emb = nn.embedding(
        input=neg_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb")
    conv_1d = fluid.nets.sequence_conv_pool(
        input=text_emb,
        num_filters=hid_dim,
        filter_size=win_size,
        act="tanh",
        pool_type="max",
        param_attr="cnn")
    text_hid = fluid.layers.fc(input=conv_1d, size=emb_dim, param_attr="text_hid")
    cos_pos = nn.cos_sim(pos_tag_emb, text_hid)
    mul_text_hid = fluid.layers.sequence_expand_as(x=text_hid, y=neg_tag_emb)
    mul_cos_neg = nn.cos_sim(neg_tag_emb, mul_text_hid)
    cos_neg_all = fluid.layers.sequence_reshape(input=mul_cos_neg, new_dim=neg_size)
    # choose the max negative cosine similarity
    cos_neg = nn.reduce_max(cos_neg_all, dim=1, keep_dim=True)
    # calculate hinge loss: max(0, margin - cos_pos + cos_neg)
    loss_part1 = nn.elementwise_sub(
        tensor.fill_constant_batch_size_like(
            input=cos_pos,
            shape=[-1, 1],
            value=margin,
            dtype='float32'),
        cos_pos)
    loss_part2 = nn.elementwise_add(loss_part1, cos_neg)
    loss_part3 = nn.elementwise_max(
        tensor.fill_constant_batch_size_like(
            input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
        loss_part2)
    avg_cost = nn.mean(loss_part3)
    less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32')
    correct = nn.reduce_sum(less)
    return text, pos_tag, neg_tag, avg_cost, correct, cos_pos
(The source diffs of two data files in this commit are too large to display; view the blobs instead. A further file diff is collapsed.)
import sys
import six
import collections
import os
import csv
import re
def word_count(column_num, input_file, word_freq=None):
"""
compute word count from corpus
"""
if word_freq is None:
word_freq = collections.defaultdict(int)
data_file = csv.reader(input_file)
for row in data_file:
for w in re.split(r'\W+',row[column_num].strip()):
word_freq[w]+= 1
return word_freq
def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""):
"""
Build a word dictionary from the corpus, Keys of the dictionary are words,
and values are zero-based IDs of these words.
"""
word_freq = collections.defaultdict(int)
files = os.listdir(train_dir)
for fi in files:
with open(train_dir + '/' + fi, "r") as f:
word_freq = word_count(column_num, f, word_freq)
files = os.listdir(test_dir)
for fi in files:
with open(test_dir + '/' + fi, "r") as f:
word_freq = word_count(column_num, f, word_freq)
word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq]
word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*word_freq_sorted))
word_idx = dict(list(zip(words, six.moves.range(len(words)))))
return word_idx
def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, output_test_dir):
files = os.listdir(train_dir)
if not os.path.exists(output_train_dir):
os.mkdir(output_train_dir)
for fi in files:
with open(train_dir + '/' + fi, "r") as f:
with open(output_train_dir + '/' + fi, "w") as wf:
data_file = csv.reader(f)
for row in data_file:
tag_raw = re.split(r'\W+', row[0].strip())
pos_index = tag_idx.get(tag_raw[0])
wf.write(str(pos_index) + ",")
text_raw = re.split(r'\W+', row[2].strip())
l = [text_idx.get(w) for w in text_raw]
for w in l:
wf.write(str(w) + " ")
wf.write("\n")
files = os.listdir(test_dir)
if not os.path.exists(output_test_dir):
os.mkdir(output_test_dir)
for fi in files:
with open(test_dir + '/' + fi, "r") as f:
with open(output_test_dir + '/' + fi, "w") as wf:
data_file = csv.reader(f)
for row in data_file:
tag_raw = re.split(r'\W+', row[0].strip())
pos_index = tag_idx.get(tag_raw[0])
wf.write(str(pos_index) + ",")
text_raw = re.split(r'\W+', row[2].strip())
l = [text_idx.get(w) for w in text_raw]
for w in l:
wf.write(str(w) + " ")
wf.write("\n")
def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab_text, output_vocab_tag):
print("start constuct word dict")
vocab_text = build_dict(2, 0, train_dir, test_dir)
with open(output_vocab_text, "w") as wf:
wf.write(str(len(vocab_text)) + "\n")
vocab_tag = build_dict(0, 0, train_dir, test_dir)
with open(output_vocab_tag, "w") as wf:
wf.write(str(len(vocab_tag)) + "\n")
print("construct word dict done\n")
write_paddle(vocab_text, vocab_tag, train_dir, test_dir, output_train_dir, output_test_dir)
train_dir = sys.argv[1]
test_dir = sys.argv[2]
output_train_dir = sys.argv[3]
output_test_dir = sys.argv[4]
output_vocab_text = sys.argv[5]
output_vocab_tag = sys.argv[6]
text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab_text, output_vocab_tag)
@@ -7,90 +7,83 @@ import math
import argparse
import paddle
import paddle.fluid as fluid
-import paddle.fluid.layers.nn as nn
-import paddle.fluid.layers.tensor as tensor
-import paddle.fluid.layers.control_flow as cf
-import paddle.fluid.layers.io as io
import time
import utils
+import net
SEED = 102
def parse_args():
    parser = argparse.ArgumentParser("TagSpace benchmark.")
-    parser.add_argument('train_file')
-    parser.add_argument('test_file')
-    parser.add_argument('--use_cuda', help='whether use gpu')
+    parser.add_argument(
+        '--neg_size', type=int, default=3, help='neg/pos ratio')
+    parser.add_argument(
+        '--train_dir', type=str, default='train_data', help='train file address')
+    parser.add_argument(
+        '--vocab_text_path', type=str, default='vocab_text.txt', help='vocab_text file address')
+    parser.add_argument(
+        '--vocab_tag_path', type=str, default='vocab_tag.txt', help='vocab_text file address')
+    parser.add_argument(
+        '--model_dir', type=str, default='model_', help='model dir')
+    parser.add_argument(
+        '--batch_size', type=int, default=5, help='num of batch size')
+    parser.add_argument(
+        '--print_batch', type=int, default=10, help='num of print batch')
+    parser.add_argument(
+        '--pass_num', type=int, default=10, help='num of epoch')
+    parser.add_argument(
+        '--use_cuda', type=int, default=0, help='whether use gpu')
+    parser.add_argument(
+        '--parallel', type=int, default=0, help='whether parallel')
+    parser.add_argument(
+        '--base_lr', type=float, default=0.01, help='learning rate')
+    parser.add_argument(
+        '--num_devices', type=int, default=1, help='Number of GPU devices')
    args = parser.parse_args()
    return args
-def network(vocab_text_size, vocab_tag_size, emb_dim=10, hid_dim=1000, win_size=5, margin=0.1, neg_size=5):
-    """ network definition """
-    text = io.data(name="text", shape=[1], lod_level=1, dtype='int64')
-    pos_tag = io.data(name="pos_tag", shape=[1], lod_level=1, dtype='int64')
-    neg_tag = io.data(name="neg_tag", shape=[1], lod_level=1, dtype='int64')
-    text_emb = nn.embedding(
-        input=text, size=[vocab_text_size, emb_dim], param_attr="text_emb")
-    pos_tag_emb = nn.embedding(
-        input=pos_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb")
-    neg_tag_emb = nn.embedding(
-        input=neg_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb")
-    conv_1d = fluid.nets.sequence_conv_pool(
-        input=text_emb,
-        num_filters=hid_dim,
-        filter_size=win_size,
-        act="tanh",
-        pool_type="max",
-        param_attr="cnn")
-    text_hid = fluid.layers.fc(input=conv_1d, size=emb_dim, param_attr="text_hid")
-    cos_pos = nn.cos_sim(pos_tag_emb, text_hid)
-    mul_text_hid = fluid.layers.sequence_expand_as(x=text_hid, y=neg_tag_emb)
-    mul_cos_neg = nn.cos_sim(neg_tag_emb, mul_text_hid)
-    cos_neg_all = fluid.layers.sequence_reshape(input=mul_cos_neg, new_dim=neg_size)
-    #choose max negtive cosine
-    cos_neg = nn.reduce_max(cos_neg_all, dim=1, keep_dim=True)
-    #calculate hinge loss
-    loss_part1 = nn.elementwise_sub(
-        tensor.fill_constant_batch_size_like(
-            input=cos_pos,
-            shape=[-1, 1],
-            value=margin,
-            dtype='float32'),
-        cos_pos)
-    loss_part2 = nn.elementwise_add(loss_part1, cos_neg)
-    loss_part3 = nn.elementwise_max(
-        tensor.fill_constant_batch_size_like(
-            input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
-        loss_part2)
-    avg_cost = nn.mean(loss_part3)
-    less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32')
-    correct = nn.reduce_sum(less)
-    return text, pos_tag, neg_tag, avg_cost, correct, cos_pos
+def get_cards(args):
+    return args.num_devices
-def train(train_reader, vocab_text, vocab_tag, base_lr, batch_size, neg_size,
-          pass_num, use_cuda, model_dir):
-    """ train network """
+def train():
+    """ do training """
    args = parse_args()
-    vocab_text_size = len(vocab_text)
-    vocab_tag_size = len(vocab_tag)
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    train_dir = args.train_dir
+    vocab_text_path = args.vocab_text_path
+    vocab_tag_path = args.vocab_tag_path
+    use_cuda = True if args.use_cuda else False
+    parallel = True if args.parallel else False
+    batch_size = args.batch_size
+    neg_size = args.neg_size
+    print("use_cuda: {}, parallel: {}, batch_size: {}, neg_size: {} "
+          .format(use_cuda, parallel, batch_size, neg_size))
+    vocab_text_size, vocab_tag_size, train_reader = utils.prepare_data(
+        file_dir=train_dir, vocab_text_path=vocab_text_path,
+        vocab_tag_path=vocab_tag_path, neg_size=neg_size,
+        batch_size=batch_size * get_cards(args),
+        buffer_size=batch_size*100, is_train=True)
+    """ train network """
    # Train program
-    text, pos_tag, neg_tag, avg_cost, correct, cos_pos = network(vocab_text_size, vocab_tag_size, neg_size=neg_size)
+    text, pos_tag, neg_tag, avg_cost, correct, cos_pos = net.network(vocab_text_size, vocab_tag_size, neg_size=neg_size)
    # Optimization to minimize lost
-    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=base_lr)
+    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr)
    sgd_optimizer.minimize(avg_cost)
    # Initialize executor
-    startup_program = fluid.default_startup_program()
-    loop_program = fluid.default_main_program()
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
-    exe.run(startup_program)
+    if parallel:
+        train_exe = fluid.ParallelExecutor(
+            use_cuda=use_cuda,
+            loss_name=avg_cost.name)
+    else:
+        train_exe = exe
+    pass_num = args.pass_num
+    model_dir = args.model_dir
+    fetch_list = [avg_cost.name]
+    exe.run(fluid.default_startup_program())
    total_time = 0.0
    for pass_idx in range(pass_num):
        epoch_idx = pass_idx + 1
@@ -100,17 +93,16 @@ def train(train_reader, vocab_text, vocab_tag, base_lr, batch_size, neg_size,
            lod_text_seq = utils.to_lodtensor([dat[0] for dat in data], place)
            lod_pos_tag = utils.to_lodtensor([dat[1] for dat in data], place)
            lod_neg_tag = utils.to_lodtensor([dat[2] for dat in data], place)
-            loss_val, correct_val = exe.run(
-                loop_program,
+            loss_val, correct_val = train_exe.run(
                feed={
                    "text": lod_text_seq,
                    "pos_tag": lod_pos_tag,
                    "neg_tag": lod_neg_tag},
-                fetch_list=[avg_cost, correct])
-            if batch_id % 10 == 0:
-                print("TRAIN --> pass: {} batch_id: {} avg_cost: {}, acc: {}"
-                      .format(pass_idx, batch_id, loss_val,
-                              float(correct_val) / batch_size))
+                fetch_list=[avg_cost.name, correct.name])
+            if batch_id % args.print_batch == 0:
+                print("TRAIN --> pass: {} batch_num: {} avg_cost: {}, acc: {}"
+                      .format(pass_idx, (batch_id+10) * batch_size, np.mean(loss_val),
+                              float(np.sum(correct_val)) / batch_size))
        t1 = time.time()
        total_time += t1 - t0
        print("epoch:%d num_steps:%d time_cost(s):%f" %
@@ -121,27 +113,5 @@ def train(train_reader, vocab_text, vocab_tag, base_lr, batch_size, neg_size,
    fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
    print("finish training")
-def train_net():
-    """ do training """
-    args = parse_args()
-    train_file = args.train_file
-    test_file = args.test_file
-    use_cuda = True if args.use_cuda else False
-    batch_size = 100
-    neg_size = 3
-    vocab_text, vocab_tag, train_reader, test_reader = utils.prepare_data(
-        train_file, test_file, neg_size=neg_size, batch_size=batch_size, buffer_size=batch_size*100, word_freq_threshold=0)
-    train(
-        train_reader=train_reader,
-        vocab_text=vocab_text,
-        vocab_tag=vocab_tag,
-        base_lr=0.01,
-        batch_size=batch_size,
-        neg_size=neg_size,
-        pass_num=10,
-        use_cuda=use_cuda,
-        model_dir="model")
if __name__ == "__main__":
-    train_net()
+    train()
import re
import sys
import collections
+import os
import six
import time
import numpy as np
@@ -23,30 +24,39 @@ def to_lodtensor(data, place):
    res.set_lod([lod])
    return res
-def prepare_data(train_filename,
-                 test_filename,
+def get_vocab_size(vocab_path):
+    with open(vocab_path, "r") as rf:
+        line = rf.readline()
+        return int(line.strip())
+def prepare_data(file_dir,
+                 vocab_text_path,
+                 vocab_tag_path,
                 batch_size,
-                 neg_size=1,
-                 buffer_size=1000,
-                 word_freq_threshold=0,
-                 enable_ce=False):
+                 neg_size,
+                 buffer_size,
+                 is_train=True):
    """ prepare the AG's News Topic Classification data """
-    print("start constuct word dict")
-    vocab_text = build_dict(2, word_freq_threshold, train_filename, test_filename)
-    vocab_tag = build_dict(0, word_freq_threshold, train_filename, test_filename)
-    print("construct word dict done\n")
-    train_reader = sort_batch(
-        paddle.reader.shuffle(
-            train(
-                train_filename, vocab_text, vocab_tag, neg_size,
-                buffer_size, data_type=DataType.SEQ),
-            buf_size=buffer_size),
-        batch_size, batch_size * 20)
-    test_reader = sort_batch(
-        test(
-            test_filename, vocab_text, vocab_tag, neg_size, buffer_size, data_type=DataType.SEQ),
-        batch_size, batch_size * 20)
-    return vocab_text, vocab_tag, train_reader, test_reader
+    print("start read file")
+    if is_train:
+        vocab_text_size = get_vocab_size(vocab_text_path)
+        vocab_tag_size = get_vocab_size(vocab_tag_path)
+        reader = sort_batch(
+            paddle.reader.shuffle(
+                train(
+                    file_dir, vocab_tag_size, neg_size,
+                    buffer_size, data_type=DataType.SEQ),
+                buf_size=buffer_size),
+            batch_size, batch_size * 20)
+    else:
+        vocab_tag_size = get_vocab_size(vocab_tag_path)
+        vocab_text_size = 0
+        reader = sort_batch(
+            test(
+                file_dir, vocab_tag_size, buffer_size, data_type=DataType.SEQ),
+            batch_size, batch_size * 20)
+    return vocab_text_size, vocab_tag_size, reader
def sort_batch(reader, batch_size, sort_group_size, drop_last=False):
    """
@@ -97,82 +107,57 @@ def sort_batch(reader, batch_size, sort_group_size, drop_last=False):
class DataType(object):
    SEQ = 2
-def word_count(column_num, input_file, word_freq=None):
-    """
-    compute word count from corpus
-    """
-    if word_freq is None:
-        word_freq = collections.defaultdict(int)
-    data_file = csv.reader(input_file)
-    for row in data_file:
-        for w in re.split(r'\W+',row[column_num].strip()):
-            word_freq[w]+= 1
-    return word_freq
-def build_dict(column_num=2, min_word_freq=50, train_filename="", test_filename=""):
-    """
-    Build a word dictionary from the corpus, Keys of the dictionary are words,
-    and values are zero-based IDs of these words.
-    """
-    with open(train_filename) as trainf:
-        with open(test_filename) as testf:
-            word_freq = word_count(column_num, testf, word_count(column_num, trainf))
-    word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq]
-    word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
-    words, _ = list(zip(*word_freq_sorted))
-    word_idx = dict(list(zip(words, six.moves.range(len(words)))))
-    return word_idx
-def train_reader_creator(filename, text_idx, tag_idx, neg_size, n, data_type):
-    def reader():
-        with open(filename) as input_file:
-            data_file = csv.reader(input_file)
-            for row in data_file:
-                text_raw = re.split(r'\W+', row[2].strip())
-                text = [text_idx.get(w) for w in text_raw]
-                tag_raw = re.split(r'\W+', row[0].strip())
-                pos_index = tag_idx.get(tag_raw[0])
-                pos_tag=[]
-                pos_tag.append(pos_index)
-                neg_tag=[]
-                max_iter = 100
-                now_iter = 0
-                sum_n = 0
-                while(sum_n < neg_size) :
-                    now_iter += 1
-                    if now_iter > max_iter:
-                        print("error : only one class")
-                        sys.exit(0)
-                    rand_i = np.random.randint(0, len(tag_idx))
-                    if rand_i != pos_index:
-                        neg_index=rand_i
-                        neg_tag.append(neg_index)
-                        sum_n += 1
-                if n > 0 and len(text) > n: continue
-                yield text, pos_tag, neg_tag
+def train_reader_creator(file_dir, tag_size, neg_size, n, data_type):
+    def reader():
+        files = os.listdir(file_dir)
+        for fi in files:
+            with open(file_dir + '/' + fi, "r") as f:
+                for l in f:
+                    l = l.strip().split(",")
+                    pos_index = int(l[0])
+                    pos_tag = []
+                    pos_tag.append(pos_index)
+                    text_raw = l[1].split()
+                    text = [int(w) for w in text_raw]
+                    neg_tag = []
+                    max_iter = 100
+                    now_iter = 0
+                    sum_n = 0
+                    while(sum_n < neg_size) :
+                        now_iter += 1
+                        if now_iter > max_iter:
+                            print("error : only one class")
+                            sys.exit(0)
+                        rand_i = np.random.randint(0, tag_size)
+                        if rand_i != pos_index:
+                            neg_index = rand_i
+                            neg_tag.append(neg_index)
+                            sum_n += 1
+                    if n > 0 and len(text) > n: continue
+                    yield text, pos_tag, neg_tag
    return reader
-def test_reader_creator(filename, text_idx, tag_idx, n, data_type):
+def test_reader_creator(file_dir, tag_size, n, data_type):
    def reader():
-        with open(filename) as input_file:
-            data_file = csv.reader(input_file)
-            for row in data_file:
-                text_raw = re.split(r'\W+', row[2].strip())
-                text = [text_idx.get(w) for w in text_raw]
-                tag_raw = re.split(r'\W+', row[0].strip())
-                pos_index = tag_idx.get(tag_raw[0])
-                pos_tag = []
-                pos_tag.append(pos_index)
-                for ii in range(len(tag_idx)):
-                    tag = []
-                    tag.append(ii)
-                    yield text, tag, pos_tag
+        files = os.listdir(file_dir)
+        for fi in files:
+            with open(file_dir + '/' + fi, "r") as f:
+                for l in f:
+                    l = l.strip().split(",")
+                    pos_index = int(l[0])
+                    pos_tag = []
+                    pos_tag.append(pos_index)
+                    text_raw = l[1].split()
+                    text = [int(w) for w in text_raw]
+                    for ii in range(tag_size):
+                        tag = []
+                        tag.append(ii)
+                        yield text, tag, pos_tag
    return reader
-def train(filename, text_idx, tag_idx, neg_size, n, data_type=DataType.SEQ):
-    return train_reader_creator(filename, text_idx, tag_idx, neg_size, n, data_type)
+def train(train_dir, tag_size, neg_size, n, data_type=DataType.SEQ):
+    return train_reader_creator(train_dir, tag_size, neg_size, n, data_type)
-def test(filename, text_idx, tag_idx, neg_size, n, data_type=DataType.SEQ):
-    return test_reader_creator(filename, text_idx, tag_idx, n, data_type)
+def test(test_dir, tag_size, n, data_type=DataType.SEQ):
+    return test_reader_creator(test_dir, tag_size, n, data_type)