未验证 提交 e5a1abfe 编写于 作者: Z zhang wenhui 提交者: GitHub

cherry-pick 1.6 to develop (#4004)

上级 2520773d
...@@ -45,6 +45,9 @@ session-based推荐应用场景非常广泛,比如用户的商品浏览、新 ...@@ -45,6 +45,9 @@ session-based推荐应用场景非常广泛,比如用户的商品浏览、新
运行样例程序可跳过'RSC15 数据下载及预处理'部分 运行样例程序可跳过'RSC15 数据下载及预处理'部分
**要求使用PaddlePaddle 1.6及以上版本或适当的develop版本。**
同时推荐用户参考[ IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/122296) 同时推荐用户参考[ IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/122296)
## RSC15 数据下载及预处理 ## RSC15 数据下载及预处理
...@@ -278,7 +281,7 @@ model:model_r@20/epoch_10 recall@20:0.681 time_cost(s):12.2 ...@@ -278,7 +281,7 @@ model:model_r@20/epoch_10 recall@20:0.681 time_cost(s):12.2
可参考cluster_train.py 配置其他多机环境 可参考cluster_train.py 配置其他多机环境
运行命令本地模拟多机场景 运行命令本地模拟多机场景, 暂不支持windows
``` ```
sh cluster_train.sh sh cluster_train.sh
``` ```
......
...@@ -2,8 +2,8 @@ import sys ...@@ -2,8 +2,8 @@ import sys
def convert_format(input, output): def convert_format(input, output):
with open(input) as rf: with open(input, "r", encoding='utf-8') as rf:
with open(output, "w") as wf: with open(output, "w", encoding='utf-8') as wf:
last_sess = -1 last_sess = -1
sign = 1 sign = 1
i = 0 i = 0
......
...@@ -71,6 +71,7 @@ def infer(test_reader, use_cuda, model_path): ...@@ -71,6 +71,7 @@ def infer(test_reader, use_cuda, model_path):
if __name__ == "__main__": if __name__ == "__main__":
utils.check_version()
args = parse_args() args = parse_args()
start_index = args.start_index start_index = args.start_index
last_index = args.last_index last_index = args.last_index
......
...@@ -84,6 +84,7 @@ def infer(args, vocab_size, test_reader, use_cuda): ...@@ -84,6 +84,7 @@ def infer(args, vocab_size, test_reader, use_cuda):
if __name__ == "__main__": if __name__ == "__main__":
utils.check_version()
args = parse_args() args = parse_args()
start_index = args.start_index start_index = args.start_index
last_index = args.last_index last_index = args.last_index
......
...@@ -10,12 +10,12 @@ def all_vocab_network(vocab_size, ...@@ -10,12 +10,12 @@ def all_vocab_network(vocab_size,
gru_lr_x = 1.0 gru_lr_x = 1.0
fc_lr_x = 1.0 fc_lr_x = 1.0
# Input data # Input data
src_wordseq = fluid.layers.data( src_wordseq = fluid.data(
name="src_wordseq", shape=[1], dtype="int64", lod_level=1) name="src_wordseq", shape=[None, 1], dtype="int64", lod_level=1)
dst_wordseq = fluid.layers.data( dst_wordseq = fluid.data(
name="dst_wordseq", shape=[1], dtype="int64", lod_level=1) name="dst_wordseq", shape=[None, 1], dtype="int64", lod_level=1)
emb = fluid.layers.embedding( emb = fluid.embedding(
input=src_wordseq, input=src_wordseq,
size=[vocab_size, hid_size], size=[vocab_size, hid_size],
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
...@@ -56,19 +56,20 @@ def train_bpr_network(vocab_size, neg_size, hid_size, drop_out=0.2): ...@@ -56,19 +56,20 @@ def train_bpr_network(vocab_size, neg_size, hid_size, drop_out=0.2):
gru_lr_x = 1.0 gru_lr_x = 1.0
fc_lr_x = 1.0 fc_lr_x = 1.0
# Input data # Input data
src = fluid.layers.data(name="src", shape=[1], dtype="int64", lod_level=1) src = fluid.data(name="src", shape=[None, 1], dtype="int64", lod_level=1)
pos_label = fluid.layers.data( pos_label = fluid.data(
name="pos_label", shape=[1], dtype="int64", lod_level=1) name="pos_label", shape=[None, 1], dtype="int64", lod_level=1)
label = fluid.layers.data( label = fluid.data(
name="label", shape=[neg_size + 1], dtype="int64", lod_level=1) name="label", shape=[None, neg_size + 1], dtype="int64", lod_level=1)
emb_src = fluid.layers.embedding( emb_src = fluid.embedding(
input=src, input=src,
size=[vocab_size, hid_size], size=[vocab_size, hid_size],
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name="emb", name="emb",
initializer=fluid.initializer.XavierInitializer(), initializer=fluid.initializer.XavierInitializer(),
learning_rate=emb_lr_x)) learning_rate=emb_lr_x))
emb_src = fluid.layers.squeeze(input=emb_src, axes=[1])
emb_src_drop = fluid.layers.dropout(emb_src, dropout_prob=drop_out) emb_src_drop = fluid.layers.dropout(emb_src, dropout_prob=drop_out)
...@@ -90,7 +91,7 @@ def train_bpr_network(vocab_size, neg_size, hid_size, drop_out=0.2): ...@@ -90,7 +91,7 @@ def train_bpr_network(vocab_size, neg_size, hid_size, drop_out=0.2):
gru_h0_drop = fluid.layers.dropout(gru_h0, dropout_prob=drop_out) gru_h0_drop = fluid.layers.dropout(gru_h0, dropout_prob=drop_out)
label_re = fluid.layers.sequence_reshape(input=label, new_dim=1) label_re = fluid.layers.sequence_reshape(input=label, new_dim=1)
emb_label = fluid.layers.embedding( emb_label1 = fluid.embedding(
input=label_re, input=label_re,
size=[vocab_size, hid_size], size=[vocab_size, hid_size],
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
...@@ -98,6 +99,7 @@ def train_bpr_network(vocab_size, neg_size, hid_size, drop_out=0.2): ...@@ -98,6 +99,7 @@ def train_bpr_network(vocab_size, neg_size, hid_size, drop_out=0.2):
initializer=fluid.initializer.XavierInitializer(), initializer=fluid.initializer.XavierInitializer(),
learning_rate=emb_lr_x)) learning_rate=emb_lr_x))
emb_label = fluid.layers.squeeze(input=emb_label1, axes=[1])
emb_label_drop = fluid.layers.dropout(emb_label, dropout_prob=drop_out) emb_label_drop = fluid.layers.dropout(emb_label, dropout_prob=drop_out)
gru_exp = fluid.layers.expand( gru_exp = fluid.layers.expand(
...@@ -120,19 +122,20 @@ def train_cross_entropy_network(vocab_size, neg_size, hid_size, drop_out=0.2): ...@@ -120,19 +122,20 @@ def train_cross_entropy_network(vocab_size, neg_size, hid_size, drop_out=0.2):
gru_lr_x = 1.0 gru_lr_x = 1.0
fc_lr_x = 1.0 fc_lr_x = 1.0
# Input data # Input data
src = fluid.layers.data(name="src", shape=[1], dtype="int64", lod_level=1) src = fluid.data(name="src", shape=[None, 1], dtype="int64", lod_level=1)
pos_label = fluid.layers.data( pos_label = fluid.data(
name="pos_label", shape=[1], dtype="int64", lod_level=1) name="pos_label", shape=[None, 1], dtype="int64", lod_level=1)
label = fluid.layers.data( label = fluid.data(
name="label", shape=[neg_size + 1], dtype="int64", lod_level=1) name="label", shape=[None, neg_size + 1], dtype="int64", lod_level=1)
emb_src = fluid.layers.embedding( emb_src = fluid.embedding(
input=src, input=src,
size=[vocab_size, hid_size], size=[vocab_size, hid_size],
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name="emb", name="emb",
initializer=fluid.initializer.XavierInitializer(), initializer=fluid.initializer.XavierInitializer(),
learning_rate=emb_lr_x)) learning_rate=emb_lr_x))
emb_src = fluid.layers.squeeze(input=emb_src, axes=[1])
emb_src_drop = fluid.layers.dropout(emb_src, dropout_prob=drop_out) emb_src_drop = fluid.layers.dropout(emb_src, dropout_prob=drop_out)
...@@ -154,13 +157,14 @@ def train_cross_entropy_network(vocab_size, neg_size, hid_size, drop_out=0.2): ...@@ -154,13 +157,14 @@ def train_cross_entropy_network(vocab_size, neg_size, hid_size, drop_out=0.2):
gru_h0_drop = fluid.layers.dropout(gru_h0, dropout_prob=drop_out) gru_h0_drop = fluid.layers.dropout(gru_h0, dropout_prob=drop_out)
label_re = fluid.layers.sequence_reshape(input=label, new_dim=1) label_re = fluid.layers.sequence_reshape(input=label, new_dim=1)
emb_label = fluid.layers.embedding( emb_label1 = fluid.embedding(
input=label_re, input=label_re,
size=[vocab_size, hid_size], size=[vocab_size, hid_size],
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name="emb", name="emb",
initializer=fluid.initializer.XavierInitializer(), initializer=fluid.initializer.XavierInitializer(),
learning_rate=emb_lr_x)) learning_rate=emb_lr_x))
emb_label = fluid.layers.squeeze(input=emb_label1, axes=[1])
emb_label_drop = fluid.layers.dropout(emb_label, dropout_prob=drop_out) emb_label_drop = fluid.layers.dropout(emb_label, dropout_prob=drop_out)
...@@ -180,8 +184,8 @@ def train_cross_entropy_network(vocab_size, neg_size, hid_size, drop_out=0.2): ...@@ -180,8 +184,8 @@ def train_cross_entropy_network(vocab_size, neg_size, hid_size, drop_out=0.2):
def infer_network(vocab_size, batch_size, hid_size, dropout=0.2): def infer_network(vocab_size, batch_size, hid_size, dropout=0.2):
src = fluid.layers.data(name="src", shape=[1], dtype="int64", lod_level=1) src = fluid.data(name="src", shape=[None, 1], dtype="int64", lod_level=1)
emb_src = fluid.layers.embedding( emb_src = fluid.embedding(
input=src, size=[vocab_size, hid_size], param_attr="emb") input=src, size=[vocab_size, hid_size], param_attr="emb")
emb_src_drop = fluid.layers.dropout( emb_src_drop = fluid.layers.dropout(
emb_src, dropout_prob=dropout, is_test=True) emb_src, dropout_prob=dropout, is_test=True)
...@@ -198,20 +202,18 @@ def infer_network(vocab_size, batch_size, hid_size, dropout=0.2): ...@@ -198,20 +202,18 @@ def infer_network(vocab_size, batch_size, hid_size, dropout=0.2):
gru_h0_drop = fluid.layers.dropout( gru_h0_drop = fluid.layers.dropout(
gru_h0, dropout_prob=dropout, is_test=True) gru_h0, dropout_prob=dropout, is_test=True)
all_label = fluid.layers.data( all_label = fluid.data(
name="all_label", name="all_label", shape=[vocab_size, 1], dtype="int64")
shape=[vocab_size, 1], emb_all_label = fluid.embedding(
dtype="int64",
append_batch_size=False)
emb_all_label = fluid.layers.embedding(
input=all_label, size=[vocab_size, hid_size], param_attr="emb") input=all_label, size=[vocab_size, hid_size], param_attr="emb")
emb_all_label = fluid.layers.squeeze(input=emb_all_label, axes=[1])
emb_all_label_drop = fluid.layers.dropout( emb_all_label_drop = fluid.layers.dropout(
emb_all_label, dropout_prob=dropout, is_test=True) emb_all_label, dropout_prob=dropout, is_test=True)
all_pre = fluid.layers.matmul( all_pre = fluid.layers.matmul(
gru_h0_drop, emb_all_label_drop, transpose_y=True) gru_h0_drop, emb_all_label_drop, transpose_y=True)
pos_label = fluid.layers.data( pos_label = fluid.data(
name="pos_label", shape=[1], dtype="int64", lod_level=1) name="pos_label", shape=[None, 1], dtype="int64", lod_level=1)
acc = fluid.layers.accuracy(input=all_pre, label=pos_label, k=20) acc = fluid.layers.accuracy(input=all_pre, label=pos_label, k=20)
return acc return acc
...@@ -2,6 +2,11 @@ import sys ...@@ -2,6 +2,11 @@ import sys
import six import six
import collections import collections
import os import os
import sys
if six.PY2:
reload(sys)
sys.setdefaultencoding('utf-8')
def word_count(input_file, word_freq=None): def word_count(input_file, word_freq=None):
""" """
...@@ -25,11 +30,11 @@ def build_dict(min_word_freq=0, train_dir="", test_dir=""): ...@@ -25,11 +30,11 @@ def build_dict(min_word_freq=0, train_dir="", test_dir=""):
word_freq = collections.defaultdict(int) word_freq = collections.defaultdict(int)
files = os.listdir(train_dir) files = os.listdir(train_dir)
for fi in files: for fi in files:
with open(train_dir + '/' + fi, "r") as f: with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
word_freq = word_count(f, word_freq) word_freq = word_count(f, word_freq)
files = os.listdir(test_dir) files = os.listdir(test_dir)
for fi in files: for fi in files:
with open(test_dir + '/' + fi, "r") as f: with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
word_freq = word_count(f, word_freq) word_freq = word_count(f, word_freq)
word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq] word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq]
...@@ -39,13 +44,16 @@ def build_dict(min_word_freq=0, train_dir="", test_dir=""): ...@@ -39,13 +44,16 @@ def build_dict(min_word_freq=0, train_dir="", test_dir=""):
return word_idx return word_idx
def write_paddle(word_idx, train_dir, test_dir, output_train_dir, output_test_dir): def write_paddle(word_idx, train_dir, test_dir, output_train_dir,
output_test_dir):
files = os.listdir(train_dir) files = os.listdir(train_dir)
if not os.path.exists(output_train_dir): if not os.path.exists(output_train_dir):
os.mkdir(output_train_dir) os.mkdir(output_train_dir)
for fi in files: for fi in files:
with open(train_dir + '/' + fi, "r") as f: with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
with open(output_train_dir + '/' + fi, "w") as wf: with open(
os.path.join(output_train_dir, fi), "w",
encoding='utf-8') as wf:
for l in f: for l in f:
l = l.strip().split() l = l.strip().split()
l = [word_idx.get(w) for w in l] l = [word_idx.get(w) for w in l]
...@@ -57,8 +65,10 @@ def write_paddle(word_idx, train_dir, test_dir, output_train_dir, output_test_di ...@@ -57,8 +65,10 @@ def write_paddle(word_idx, train_dir, test_dir, output_train_dir, output_test_di
if not os.path.exists(output_test_dir): if not os.path.exists(output_test_dir):
os.mkdir(output_test_dir) os.mkdir(output_test_dir)
for fi in files: for fi in files:
with open(test_dir + '/' + fi, "r") as f: with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
with open(output_test_dir + '/' + fi, "w") as wf: with open(
os.path.join(output_test_dir, fi), "w",
encoding='utf-8') as wf:
for l in f: for l in f:
l = l.strip().split() l = l.strip().split()
l = [word_idx.get(w) for w in l] l = [word_idx.get(w) for w in l]
...@@ -66,9 +76,11 @@ def write_paddle(word_idx, train_dir, test_dir, output_train_dir, output_test_di ...@@ -66,9 +76,11 @@ def write_paddle(word_idx, train_dir, test_dir, output_train_dir, output_test_di
wf.write(str(w) + " ") wf.write(str(w) + " ")
wf.write("\n") wf.write("\n")
def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab):
def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
output_vocab):
vocab = build_dict(0, train_dir, test_dir) vocab = build_dict(0, train_dir, test_dir)
with open(output_vocab, "w") as wf: with open(output_vocab, "w", encoding='utf-8') as wf:
wf.write(str(len(vocab)) + "\n") wf.write(str(len(vocab)) + "\n")
#wf.write(str(vocab)) #wf.write(str(vocab))
write_paddle(vocab, train_dir, test_dir, output_train_dir, output_test_dir) write_paddle(vocab, train_dir, test_dir, output_train_dir, output_test_dir)
...@@ -79,4 +91,5 @@ test_dir = sys.argv[2] ...@@ -79,4 +91,5 @@ test_dir = sys.argv[2]
output_train_dir = sys.argv[3] output_train_dir = sys.argv[3]
output_test_dir = sys.argv[4] output_test_dir = sys.argv[4]
output_vocab = sys.argv[5] output_vocab = sys.argv[5]
text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab) text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
output_vocab)
...@@ -58,8 +58,8 @@ def train(): ...@@ -58,8 +58,8 @@ def train():
""" do training """ """ do training """
args = parse_args() args = parse_args()
if args.enable_ce: if args.enable_ce:
fluid.default_startup_program().random_seed = SEED fluid.default_startup_program().random_seed = SEED
fluid.default_main_program().random_seed = SEED fluid.default_main_program().random_seed = SEED
hid_size = args.hid_size hid_size = args.hid_size
train_dir = args.train_dir train_dir = args.train_dir
vocab_path = args.vocab_path vocab_path = args.vocab_path
...@@ -143,17 +143,16 @@ def train(): ...@@ -143,17 +143,16 @@ def train():
if args.use_cuda: if args.use_cuda:
gpu_num = device[1] gpu_num = device[1]
print("kpis\teach_pass_duration_gpu%s\t%s" % print("kpis\teach_pass_duration_gpu%s\t%s" %
(gpu_num, total_time / epoch_idx)) (gpu_num, total_time / epoch_idx))
print("kpis\ttrain_ppl_gpu%s\t%s" % print("kpis\ttrain_ppl_gpu%s\t%s" % (gpu_num, ce_ppl))
(gpu_num, ce_ppl))
else: else:
cpu_num = device[1] cpu_num = device[1]
threads_num = device[2] threads_num = device[2]
print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" % print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" %
(cpu_num, threads_num, total_time / epoch_idx)) (cpu_num, threads_num, total_time / epoch_idx))
print("kpis\ttrain_ppl_cpu%s_thread%s\t%s" % print("kpis\ttrain_ppl_cpu%s_thread%s\t%s" %
(cpu_num, threads_num, ce_ppl)) (cpu_num, threads_num, ce_ppl))
print("finish training") print("finish training")
...@@ -166,7 +165,8 @@ def get_device(args): ...@@ -166,7 +165,8 @@ def get_device(args):
threads_num = os.environ.get('NUM_THREADS', 1) threads_num = os.environ.get('NUM_THREADS', 1)
cpu_num = os.environ.get('CPU_NUM', 1) cpu_num = os.environ.get('CPU_NUM', 1)
return "cpu", int(cpu_num), int(threads_num) return "cpu", int(cpu_num), int(threads_num)
if __name__ == "__main__": if __name__ == "__main__":
utils.check_version()
train() train()
...@@ -128,4 +128,5 @@ def train(): ...@@ -128,4 +128,5 @@ def train():
if __name__ == "__main__": if __name__ == "__main__":
utils.check_version()
train() train()
...@@ -6,6 +6,7 @@ import numpy as np ...@@ -6,6 +6,7 @@ import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle import paddle
import os import os
import io
def to_lodtensor(data, place): def to_lodtensor(data, place):
...@@ -86,7 +87,7 @@ def to_lodtensor_bpr_test(raw_data, vocab_size, place): ...@@ -86,7 +87,7 @@ def to_lodtensor_bpr_test(raw_data, vocab_size, place):
def get_vocab_size(vocab_path): def get_vocab_size(vocab_path):
with open(vocab_path, "r") as rf: with io.open(vocab_path, "r", encoding='utf-8') as rf:
line = rf.readline() line = rf.readline()
return int(line.strip()) return int(line.strip())
...@@ -110,12 +111,28 @@ def prepare_data(file_dir, ...@@ -110,12 +111,28 @@ def prepare_data(file_dir,
batch_size * 20) batch_size * 20)
else: else:
vocab_size = get_vocab_size(vocab_path) vocab_size = get_vocab_size(vocab_path)
reader = paddle.batch( reader = fluid.io.batch(
test( test(
file_dir, buffer_size, data_type=DataType.SEQ), batch_size) file_dir, buffer_size, data_type=DataType.SEQ), batch_size)
return vocab_size, reader return vocab_size, reader
def check_version():
"""
Log error and exit when the installed version of paddlepaddle is
not satisfied.
"""
err = "PaddlePaddle version 1.6 or higher is required, " \
"or a suitable develop version is satisfied as well. \n" \
"Please make sure the version is good with your code." \
try:
fluid.require_version('1.6.0')
except Exception as e:
logger.error(err)
sys.exit(1)
def sort_batch(reader, batch_size, sort_group_size, drop_last=False): def sort_batch(reader, batch_size, sort_group_size, drop_last=False):
""" """
Create a batched reader. Create a batched reader.
...@@ -170,7 +187,8 @@ def reader_creator(file_dir, n, data_type): ...@@ -170,7 +187,8 @@ def reader_creator(file_dir, n, data_type):
def reader(): def reader():
files = os.listdir(file_dir) files = os.listdir(file_dir)
for fi in files: for fi in files:
with open(file_dir + '/' + fi, "r") as f: with io.open(
os.path.join(file_dir, fi), "r", encoding='utf-8') as f:
for l in f: for l in f:
if DataType.SEQ == data_type: if DataType.SEQ == data_type:
l = l.strip().split() l = l.strip().split()
......
...@@ -3,6 +3,8 @@ ...@@ -3,6 +3,8 @@
## Introduction ## Introduction
In personalized recommendation scenario, a user often is provided with several items from personalized interest matching model. In real world application, a user may have multiple views of features, say user-id, age, click-history of items, search queries. A item, e.g. news, may also have multiple views of features like news title, news category, images in news and so on. Multi-view Simnet is matching a model that combine users' and items' multiple views of features into one unified model. The model can be used in many industrial product like Baidu's feed news. The model is adapted from the paper A Multi-View Deep Learning(MV-DNN) Approach for Cross Domain User Modeling in Recommendation Systems, WWW 2015. The difference between our model and the MV-DNN is that we also consider multiple feature views of users. In personalized recommendation scenario, a user often is provided with several items from personalized interest matching model. In real world application, a user may have multiple views of features, say user-id, age, click-history of items, search queries. A item, e.g. news, may also have multiple views of features like news title, news category, images in news and so on. Multi-view Simnet is matching a model that combine users' and items' multiple views of features into one unified model. The model can be used in many industrial product like Baidu's feed news. The model is adapted from the paper A Multi-View Deep Learning(MV-DNN) Approach for Cross Domain User Modeling in Recommendation Systems, WWW 2015. The difference between our model and the MV-DNN is that we also consider multiple feature views of users.
**Now all models in PaddleRec require PaddlePaddle version 1.6 or higher, or suitable develop version.**
We also recommend users to take a look at the [IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/122294) We also recommend users to take a look at the [IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/122294)
## Dataset ## Dataset
......
...@@ -31,6 +31,22 @@ logger = logging.getLogger("fluid") ...@@ -31,6 +31,22 @@ logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
def check_version():
"""
Log error and exit when the installed version of paddlepaddle is
not satisfied.
"""
err = "PaddlePaddle version 1.6 or higher is required, " \
"or a suitable develop version is satisfied as well. \n" \
"Please make sure the version is good with your code." \
try:
fluid.require_version('1.6.0')
except Exception as e:
logger.error(err)
sys.exit(1)
def parse_args(): def parse_args():
parser = argparse.ArgumentParser("multi-view simnet") parser = argparse.ArgumentParser("multi-view simnet")
parser.add_argument("--train_file", type=str, help="Training file") parser.add_argument("--train_file", type=str, help="Training file")
...@@ -116,4 +132,5 @@ def main(): ...@@ -116,4 +132,5 @@ def main():
if __name__ == "__main__": if __name__ == "__main__":
check_version()
main() main()
...@@ -53,7 +53,6 @@ class CNNEncoder(object): ...@@ -53,7 +53,6 @@ class CNNEncoder(object):
pool_type=self.pool_type, pool_type=self.pool_type,
param_attr=self.param_name + ".param", param_attr=self.param_name + ".param",
bias_attr=self.param_name + ".bias") bias_attr=self.param_name + ".bias")
class GrnnEncoder(object): class GrnnEncoder(object):
...@@ -64,12 +63,11 @@ class GrnnEncoder(object): ...@@ -64,12 +63,11 @@ class GrnnEncoder(object):
self.hidden_size = hidden_size self.hidden_size = hidden_size
def forward(self, emb): def forward(self, emb):
fc0 = nn.fc( fc0 = nn.fc(input=emb,
input=emb, size=self.hidden_size * 3,
size=self.hidden_size * 3, param_attr=self.param_name + "_fc.w",
param_attr=self.param_name + "_fc.w", bias_attr=False)
bias_attr=False)
gru_h = nn.dynamic_gru( gru_h = nn.dynamic_gru(
input=fc0, input=fc0,
size=self.hidden_size, size=self.hidden_size,
...@@ -125,34 +123,34 @@ class MultiviewSimnet(object): ...@@ -125,34 +123,34 @@ class MultiviewSimnet(object):
def train_net(self): def train_net(self):
# input fields for query, pos_title, neg_title # input fields for query, pos_title, neg_title
q_slots = [ q_slots = [
io.data( fluid.data(
name="q%d" % i, shape=[1], lod_level=1, dtype='int64') name="q%d" % i, shape=[None, 1], lod_level=1, dtype='int64')
for i in range(len(self.query_encoders)) for i in range(len(self.query_encoders))
] ]
pt_slots = [ pt_slots = [
io.data( fluid.data(
name="pt%d" % i, shape=[1], lod_level=1, dtype='int64') name="pt%d" % i, shape=[None, 1], lod_level=1, dtype='int64')
for i in range(len(self.title_encoders)) for i in range(len(self.title_encoders))
] ]
nt_slots = [ nt_slots = [
io.data( fluid.data(
name="nt%d" % i, shape=[1], lod_level=1, dtype='int64') name="nt%d" % i, shape=[None, 1], lod_level=1, dtype='int64')
for i in range(len(self.title_encoders)) for i in range(len(self.title_encoders))
] ]
# lookup embedding for each slot # lookup embedding for each slot
q_embs = [ q_embs = [
nn.embedding( fluid.embedding(
input=query, size=self.emb_shape, param_attr="emb") input=query, size=self.emb_shape, param_attr="emb")
for query in q_slots for query in q_slots
] ]
pt_embs = [ pt_embs = [
nn.embedding( fluid.embedding(
input=title, size=self.emb_shape, param_attr="emb") input=title, size=self.emb_shape, param_attr="emb")
for title in pt_slots for title in pt_slots
] ]
nt_embs = [ nt_embs = [
nn.embedding( fluid.embedding(
input=title, size=self.emb_shape, param_attr="emb") input=title, size=self.emb_shape, param_attr="emb")
for title in nt_slots for title in nt_slots
] ]
...@@ -174,9 +172,18 @@ class MultiviewSimnet(object): ...@@ -174,9 +172,18 @@ class MultiviewSimnet(object):
nt_concat = nn.concat(nt_encodes) nt_concat = nn.concat(nt_encodes)
# projection of hidden layer # projection of hidden layer
q_hid = nn.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w', bias_attr='q_fc.b') q_hid = nn.fc(q_concat,
pt_hid = nn.fc(pt_concat, size=self.hidden_size, param_attr='t_fc.w', bias_attr='t_fc.b') size=self.hidden_size,
nt_hid = nn.fc(nt_concat, size=self.hidden_size, param_attr='t_fc.w', bias_attr='t_fc.b') param_attr='q_fc.w',
bias_attr='q_fc.b')
pt_hid = nn.fc(pt_concat,
size=self.hidden_size,
param_attr='t_fc.w',
bias_attr='t_fc.b')
nt_hid = nn.fc(nt_concat,
size=self.hidden_size,
param_attr='t_fc.w',
bias_attr='t_fc.b')
# cosine of hidden layers # cosine of hidden layers
cos_pos = nn.cos_sim(q_hid, pt_hid) cos_pos = nn.cos_sim(q_hid, pt_hid)
...@@ -205,23 +212,23 @@ class MultiviewSimnet(object): ...@@ -205,23 +212,23 @@ class MultiviewSimnet(object):
def pred_net(self, query_fields, pos_title_fields, neg_title_fields): def pred_net(self, query_fields, pos_title_fields, neg_title_fields):
q_slots = [ q_slots = [
io.data( fluid.data(
name="q%d" % i, shape=[1], lod_level=1, dtype='int64') name="q%d" % i, shape=[None, 1], lod_level=1, dtype='int64')
for i in range(len(self.query_encoders)) for i in range(len(self.query_encoders))
] ]
pt_slots = [ pt_slots = [
io.data( fluid.data(
name="pt%d" % i, shape=[1], lod_level=1, dtype='int64') name="pt%d" % i, shape=[None, 1], lod_level=1, dtype='int64')
for i in range(len(self.title_encoders)) for i in range(len(self.title_encoders))
] ]
# lookup embedding for each slot # lookup embedding for each slot
q_embs = [ q_embs = [
nn.embedding( fluid.embedding(
input=query, size=self.emb_shape, param_attr="emb") input=query, size=self.emb_shape, param_attr="emb")
for query in q_slots for query in q_slots
] ]
pt_embs = [ pt_embs = [
nn.embedding( fluid.embedding(
input=title, size=self.emb_shape, param_attr="emb") input=title, size=self.emb_shape, param_attr="emb")
for title in pt_slots for title in pt_slots
] ]
...@@ -236,8 +243,14 @@ class MultiviewSimnet(object): ...@@ -236,8 +243,14 @@ class MultiviewSimnet(object):
q_concat = nn.concat(q_encodes) q_concat = nn.concat(q_encodes)
pt_concat = nn.concat(pt_encodes) pt_concat = nn.concat(pt_encodes)
# projection of hidden layer # projection of hidden layer
q_hid = nn.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w', bias_attr='q_fc.b') q_hid = nn.fc(q_concat,
pt_hid = nn.fc(pt_concat, size=self.hidden_size, param_attr='t_fc.w', bias_attr='t_fc.b') size=self.hidden_size,
param_attr='q_fc.w',
bias_attr='q_fc.b')
pt_hid = nn.fc(pt_concat,
size=self.hidden_size,
param_attr='t_fc.w',
bias_attr='t_fc.b')
# cosine of hidden layers # cosine of hidden layers
cos = nn.cos_sim(q_hid, pt_hid) cos = nn.cos_sim(q_hid, pt_hid)
return cos return cos
...@@ -88,6 +88,22 @@ def parse_args(): ...@@ -88,6 +88,22 @@ def parse_args():
return parser.parse_args() return parser.parse_args()
def check_version():
"""
Log error and exit when the installed version of paddlepaddle is
not satisfied.
"""
err = "PaddlePaddle version 1.6 or higher is required, " \
"or a suitable develop version is satisfied as well. \n" \
"Please make sure the version is good with your code." \
try:
fluid.require_version('1.6.0')
except Exception as e:
logger.error(err)
sys.exit(1)
def start_train(args): def start_train(args):
if args.enable_ce: if args.enable_ce:
SEED = 102 SEED = 102
...@@ -145,7 +161,7 @@ def start_train(args): ...@@ -145,7 +161,7 @@ def start_train(args):
# only for ce # only for ce
if args.enable_ce: if args.enable_ce:
threads_num, cpu_num = get_cards(args) threads_num, cpu_num = get_cards(args)
epoch_idx = args.epochs epoch_idx = args.epochs
ce_loss = 0 ce_loss = 0
try: try:
ce_loss = ce_info[-2] ce_loss = ce_info[-2]
...@@ -153,9 +169,9 @@ def start_train(args): ...@@ -153,9 +169,9 @@ def start_train(args):
logger.error("ce info error") logger.error("ce info error")
print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" % print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" %
(cpu_num, threads_num, total_time / epoch_idx)) (cpu_num, threads_num, total_time / epoch_idx))
print("kpis\ttrain_loss_cpu%s_thread%s\t%s" % print("kpis\ttrain_loss_cpu%s_thread%s\t%s" %
(cpu_num, threads_num, ce_loss)) (cpu_num, threads_num, ce_loss))
def get_cards(args): def get_cards(args):
...@@ -170,4 +186,5 @@ def main(): ...@@ -170,4 +186,5 @@ def main():
if __name__ == "__main__": if __name__ == "__main__":
check_version()
main() main()
...@@ -12,6 +12,10 @@ Sequence Semantic Retrieval(SSR) Model shares the similar idea with Multi-Rate D ...@@ -12,6 +12,10 @@ Sequence Semantic Retrieval(SSR) Model shares the similar idea with Multi-Rate D
- The idea of SSR is to model a user's personalized interest of an item through matching model structure, and the representation of a news item can be computed online even the news item does not exist in training dataset. - The idea of SSR is to model a user's personalized interest of an item through matching model structure, and the representation of a news item can be computed online even the news item does not exist in training dataset.
- With the representation of news items, we are able to build an vector indexing service online for news prediction and this is the retrieval part of SSR. - With the representation of news items, we are able to build an vector indexing service online for news prediction and this is the retrieval part of SSR.
## Version
**Now all models in PaddleRec require PaddlePaddle version 1.6 or higher, or suitable develop version.**
## Dataset ## Dataset
Dataset preprocessing follows the method of [GRU4Rec Project](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleRec/gru4rec). Note that you should reuse scripts from GRU4Rec project for data preprocessing. Dataset preprocessing follows the method of [GRU4Rec Project](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleRec/gru4rec). Note that you should reuse scripts from GRU4Rec project for data preprocessing.
...@@ -39,7 +43,7 @@ cpu 单机多卡训练 ...@@ -39,7 +43,7 @@ cpu 单机多卡训练
CPU_NUM=10 python train.py --train_dir train_data --use_cuda 0 --parallel 1 --batch_size 50 --model_dir model_output --num_devices 10 CPU_NUM=10 python train.py --train_dir train_data --use_cuda 0 --parallel 1 --batch_size 50 --model_dir model_output --num_devices 10
``` ```
本地模拟多机训练 本地模拟多机训练, 不支持windows.
``` bash ``` bash
sh cluster_train.sh sh cluster_train.sh
``` ```
......
...@@ -120,6 +120,7 @@ def infer(args, vocab_size, test_reader): ...@@ -120,6 +120,7 @@ def infer(args, vocab_size, test_reader):
if __name__ == "__main__": if __name__ == "__main__":
utils.check_version()
args = parse_args() args = parse_args()
start_index = args.start_index start_index = args.start_index
last_index = args.last_index last_index = args.last_index
......
...@@ -86,16 +86,17 @@ class SequenceSemanticRetrieval(object): ...@@ -86,16 +86,17 @@ class SequenceSemanticRetrieval(object):
return correct return correct
def train(self): def train(self):
user_data = io.data(name="user", shape=[1], dtype="int64", lod_level=1) user_data = fluid.data(
pos_item_data = io.data( name="user", shape=[None, 1], dtype="int64", lod_level=1)
name="p_item", shape=[1], dtype="int64", lod_level=1) pos_item_data = fluid.data(
neg_item_data = io.data( name="p_item", shape=[None, 1], dtype="int64", lod_level=1)
name="n_item", shape=[1], dtype="int64", lod_level=1) neg_item_data = fluid.data(
user_emb = nn.embedding( name="n_item", shape=[None, 1], dtype="int64", lod_level=1)
user_emb = fluid.embedding(
input=user_data, size=self.emb_shape, param_attr="emb.item") input=user_data, size=self.emb_shape, param_attr="emb.item")
pos_item_emb = nn.embedding( pos_item_emb = fluid.embedding(
input=pos_item_data, size=self.emb_shape, param_attr="emb.item") input=pos_item_data, size=self.emb_shape, param_attr="emb.item")
neg_item_emb = nn.embedding( neg_item_emb = fluid.embedding(
input=neg_item_data, size=self.emb_shape, param_attr="emb.item") input=neg_item_data, size=self.emb_shape, param_attr="emb.item")
user_enc = self.user_encoder.forward(user_emb) user_enc = self.user_encoder.forward(user_emb)
pos_item_enc = self.item_encoder.forward(pos_item_emb) pos_item_enc = self.item_encoder.forward(pos_item_emb)
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
import random import random
import io
class Dataset: class Dataset:
...@@ -33,7 +34,7 @@ class YoochooseVocab(Vocab): ...@@ -33,7 +34,7 @@ class YoochooseVocab(Vocab):
def load(self, filelist): def load(self, filelist):
idx = 0 idx = 0
for f in filelist: for f in filelist:
with open(f, "r") as fin: with io.open(f, "r", encoding='utf-8') as fin:
for line in fin: for line in fin:
group = line.strip().split() group = line.strip().split()
for item in group: for item in group:
...@@ -64,7 +65,7 @@ class YoochooseDataset(Dataset): ...@@ -64,7 +65,7 @@ class YoochooseDataset(Dataset):
def _reader_creator(self, filelist, is_train): def _reader_creator(self, filelist, is_train):
def reader(): def reader():
for f in filelist: for f in filelist:
with open(f, 'r') as fin: with io.open(f, 'r', encoding='utf-8') as fin:
line_idx = 0 line_idx = 0
for line in fin: for line in fin:
ids = line.strip().split() ids = line.strip().split()
......
...@@ -68,9 +68,9 @@ def get_cards(args): ...@@ -68,9 +68,9 @@ def get_cards(args):
def train(args): def train(args):
if args.enable_ce: if args.enable_ce:
SEED = 102 SEED = 102
fluid.default_startup_program().random_seed = SEED fluid.default_startup_program().random_seed = SEED
fluid.default_main_program().random_seed = SEED fluid.default_main_program().random_seed = SEED
use_cuda = True if args.use_cuda else False use_cuda = True if args.use_cuda else False
parallel = True if args.parallel else False parallel = True if args.parallel else False
print("use_cuda:", use_cuda, "parallel:", parallel) print("use_cuda:", use_cuda, "parallel:", parallel)
...@@ -136,17 +136,16 @@ def train(args): ...@@ -136,17 +136,16 @@ def train(args):
if args.use_cuda: if args.use_cuda:
gpu_num = device[1] gpu_num = device[1]
print("kpis\teach_pass_duration_gpu%s\t%s" % print("kpis\teach_pass_duration_gpu%s\t%s" %
(gpu_num, total_time / epoch_idx)) (gpu_num, total_time / epoch_idx))
print("kpis\ttrain_acc_gpu%s\t%s" % print("kpis\ttrain_acc_gpu%s\t%s" % (gpu_num, ce_acc))
(gpu_num, ce_acc))
else: else:
cpu_num = device[1] cpu_num = device[1]
threads_num = device[2] threads_num = device[2]
print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" % print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" %
(cpu_num, threads_num, total_time / epoch_idx)) (cpu_num, threads_num, total_time / epoch_idx))
print("kpis\ttrain_acc_cpu%s_thread%s\t%s" % print("kpis\ttrain_acc_cpu%s_thread%s\t%s" %
(cpu_num, threads_num, ce_acc)) (cpu_num, threads_num, ce_acc))
def get_device(args): def get_device(args):
if args.use_cuda: if args.use_cuda:
...@@ -157,7 +156,7 @@ def get_device(args): ...@@ -157,7 +156,7 @@ def get_device(args):
threads_num = os.environ.get('NUM_THREADS', 1) threads_num = os.environ.get('NUM_THREADS', 1)
cpu_num = os.environ.get('CPU_NUM', 1) cpu_num = os.environ.get('CPU_NUM', 1)
return "cpu", int(cpu_num), int(threads_num) return "cpu", int(cpu_num), int(threads_num)
def main(): def main():
args = parse_args() args = parse_args()
...@@ -165,4 +164,5 @@ def main(): ...@@ -165,4 +164,5 @@ def main():
if __name__ == "__main__": if __name__ == "__main__":
utils.check_version()
main() main()
...@@ -4,10 +4,11 @@ import os ...@@ -4,10 +4,11 @@ import os
import logging import logging
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle import paddle
import io
def get_vocab_size(vocab_path): def get_vocab_size(vocab_path):
with open(vocab_path, "r") as rf: with io.open(vocab_path, "r", encoding='utf-8') as rf:
line = rf.readline() line = rf.readline()
return int(line.strip()) return int(line.strip())
...@@ -16,7 +17,7 @@ def construct_train_data(file_dir, vocab_path, batch_size): ...@@ -16,7 +17,7 @@ def construct_train_data(file_dir, vocab_path, batch_size):
vocab_size = get_vocab_size(vocab_path) vocab_size = get_vocab_size(vocab_path)
files = [file_dir + '/' + f for f in os.listdir(file_dir)] files = [file_dir + '/' + f for f in os.listdir(file_dir)]
y_data = reader.YoochooseDataset(vocab_size) y_data = reader.YoochooseDataset(vocab_size)
train_reader = paddle.batch( train_reader = fluid.io.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
y_data.train(files), buf_size=batch_size * 100), y_data.train(files), buf_size=batch_size * 100),
batch_size=batch_size) batch_size=batch_size)
...@@ -27,10 +28,26 @@ def construct_test_data(file_dir, vocab_path, batch_size): ...@@ -27,10 +28,26 @@ def construct_test_data(file_dir, vocab_path, batch_size):
vocab_size = get_vocab_size(vocab_path) vocab_size = get_vocab_size(vocab_path)
files = [file_dir + '/' + f for f in os.listdir(file_dir)] files = [file_dir + '/' + f for f in os.listdir(file_dir)]
y_data = reader.YoochooseDataset(vocab_size) y_data = reader.YoochooseDataset(vocab_size)
test_reader = paddle.batch(y_data.test(files), batch_size=batch_size) test_reader = fluid.io.batch(y_data.test(files), batch_size=batch_size)
return test_reader, vocab_size return test_reader, vocab_size
def check_version():
"""
Log error and exit when the installed version of paddlepaddle is
not satisfied.
"""
err = "PaddlePaddle version 1.6 or higher is required, " \
"or a suitable develop version is satisfied as well. \n" \
"Please make sure the version is good with your code." \
try:
fluid.require_version('1.6.0')
except Exception as e:
logger.error(err)
sys.exit(1)
def infer_data(raw_data, place): def infer_data(raw_data, place):
data = [dat[0] for dat in raw_data] data = [dat[0] for dat in raw_data]
seq_lens = [len(seq) for seq in data] seq_lens = [len(seq) for seq in data]
......
...@@ -26,6 +26,8 @@ TagSpace模型的介绍可以参阅论文[#TagSpace: Semantic Embeddings from Ha ...@@ -26,6 +26,8 @@ TagSpace模型的介绍可以参阅论文[#TagSpace: Semantic Embeddings from Ha
Tagspace模型学习文本及标签的embedding表示,应用于工业级的标签推荐,具体应用场景有feed新闻标签推荐。 Tagspace模型学习文本及标签的embedding表示,应用于工业级的标签推荐,具体应用场景有feed新闻标签推荐。
**Now all models in PaddleRec require PaddlePaddle version 1.6 or higher, or suitable develop version.**
同时推荐用户参考[ IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/122298) 同时推荐用户参考[ IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/122298)
## 数据下载及预处理 ## 数据下载及预处理
......
...@@ -71,6 +71,7 @@ def infer(test_reader, vocab_tag, use_cuda, model_path, epoch): ...@@ -71,6 +71,7 @@ def infer(test_reader, vocab_tag, use_cuda, model_path, epoch):
if __name__ == "__main__": if __name__ == "__main__":
utils.check_version()
args = parse_args() args = parse_args()
start_index = args.start_index start_index = args.start_index
last_index = args.last_index last_index = args.last_index
......
...@@ -2,42 +2,53 @@ import paddle.fluid as fluid ...@@ -2,42 +2,53 @@ import paddle.fluid as fluid
import paddle.fluid.layers.nn as nn import paddle.fluid.layers.nn as nn
import paddle.fluid.layers.tensor as tensor import paddle.fluid.layers.tensor as tensor
import paddle.fluid.layers.control_flow as cf import paddle.fluid.layers.control_flow as cf
import paddle.fluid.layers.io as io
def network(vocab_text_size, vocab_tag_size, emb_dim=10, hid_dim=1000, win_size=5, margin=0.1, neg_size=5):
def network(vocab_text_size,
vocab_tag_size,
emb_dim=10,
hid_dim=1000,
win_size=5,
margin=0.1,
neg_size=5):
""" network definition """ """ network definition """
text = io.data(name="text", shape=[1], lod_level=1, dtype='int64') text = fluid.data(name="text", shape=[None, 1], lod_level=1, dtype='int64')
pos_tag = io.data(name="pos_tag", shape=[1], lod_level=1, dtype='int64') pos_tag = fluid.data(
neg_tag = io.data(name="neg_tag", shape=[1], lod_level=1, dtype='int64') name="pos_tag", shape=[None, 1], lod_level=1, dtype='int64')
text_emb = nn.embedding( neg_tag = fluid.data(
input=text, size=[vocab_text_size, emb_dim], param_attr="text_emb") name="neg_tag", shape=[None, 1], lod_level=1, dtype='int64')
pos_tag_emb = nn.embedding( text_emb = fluid.embedding(
input=pos_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb") input=text, size=[vocab_text_size, emb_dim], param_attr="text_emb")
neg_tag_emb = nn.embedding( text_emb = fluid.layers.squeeze(input=text_emb, axes=[1])
input=neg_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb") pos_tag_emb = fluid.embedding(
input=pos_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb")
pos_tag_emb = fluid.layers.squeeze(input=pos_tag_emb, axes=[1])
neg_tag_emb = fluid.embedding(
input=neg_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb")
neg_tag_emb = fluid.layers.squeeze(input=neg_tag_emb, axes=[1])
conv_1d = fluid.nets.sequence_conv_pool( conv_1d = fluid.nets.sequence_conv_pool(
input=text_emb, input=text_emb,
num_filters=hid_dim, num_filters=hid_dim,
filter_size=win_size, filter_size=win_size,
act="tanh", act="tanh",
pool_type="max", pool_type="max",
param_attr="cnn") param_attr="cnn")
text_hid = fluid.layers.fc(input=conv_1d, size=emb_dim, param_attr="text_hid") text_hid = fluid.layers.fc(input=conv_1d,
size=emb_dim,
param_attr="text_hid")
cos_pos = nn.cos_sim(pos_tag_emb, text_hid) cos_pos = nn.cos_sim(pos_tag_emb, text_hid)
mul_text_hid = fluid.layers.sequence_expand_as(x=text_hid, y=neg_tag_emb) mul_text_hid = fluid.layers.sequence_expand_as(x=text_hid, y=neg_tag_emb)
mul_cos_neg = nn.cos_sim(neg_tag_emb, mul_text_hid) mul_cos_neg = nn.cos_sim(neg_tag_emb, mul_text_hid)
cos_neg_all = fluid.layers.sequence_reshape(input=mul_cos_neg, new_dim=neg_size) cos_neg_all = fluid.layers.sequence_reshape(
input=mul_cos_neg, new_dim=neg_size)
#choose max negtive cosine #choose max negtive cosine
cos_neg = nn.reduce_max(cos_neg_all, dim=1, keep_dim=True) cos_neg = nn.reduce_max(cos_neg_all, dim=1, keep_dim=True)
#calculate hinge loss #calculate hinge loss
loss_part1 = nn.elementwise_sub( loss_part1 = nn.elementwise_sub(
tensor.fill_constant_batch_size_like( tensor.fill_constant_batch_size_like(
input=cos_pos, input=cos_pos, shape=[-1, 1], value=margin, dtype='float32'),
shape=[-1, 1], cos_pos)
value=margin,
dtype='float32'),
cos_pos)
loss_part2 = nn.elementwise_add(loss_part1, cos_neg) loss_part2 = nn.elementwise_add(loss_part1, cos_neg)
loss_part3 = nn.elementwise_max( loss_part3 = nn.elementwise_max(
tensor.fill_constant_batch_size_like( tensor.fill_constant_batch_size_like(
......
...@@ -2,8 +2,13 @@ import sys ...@@ -2,8 +2,13 @@ import sys
import six import six
import collections import collections
import os import os
import csv import csv
import re import re
import sys
if six.PY2:
reload(sys)
sys.setdefaultencoding('utf-8')
def word_count(column_num, input_file, word_freq=None): def word_count(column_num, input_file, word_freq=None):
""" """
...@@ -13,10 +18,11 @@ def word_count(column_num, input_file, word_freq=None): ...@@ -13,10 +18,11 @@ def word_count(column_num, input_file, word_freq=None):
word_freq = collections.defaultdict(int) word_freq = collections.defaultdict(int)
data_file = csv.reader(input_file) data_file = csv.reader(input_file)
for row in data_file: for row in data_file:
for w in re.split(r'\W+',row[column_num].strip()): for w in re.split(r'\W+', row[column_num].strip()):
word_freq[w]+= 1 word_freq[w] += 1
return word_freq return word_freq
def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""): def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""):
""" """
Build a word dictionary from the corpus, Keys of the dictionary are words, Build a word dictionary from the corpus, Keys of the dictionary are words,
...@@ -25,11 +31,11 @@ def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""): ...@@ -25,11 +31,11 @@ def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""):
word_freq = collections.defaultdict(int) word_freq = collections.defaultdict(int)
files = os.listdir(train_dir) files = os.listdir(train_dir)
for fi in files: for fi in files:
with open(train_dir + '/' + fi, "r") as f: with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
word_freq = word_count(column_num, f, word_freq) word_freq = word_count(column_num, f, word_freq)
files = os.listdir(test_dir) files = os.listdir(test_dir)
for fi in files: for fi in files:
with open(test_dir + '/' + fi, "r") as f: with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
word_freq = word_count(column_num, f, word_freq) word_freq = word_count(column_num, f, word_freq)
word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq] word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq]
...@@ -39,13 +45,16 @@ def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""): ...@@ -39,13 +45,16 @@ def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""):
return word_idx return word_idx
def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, output_test_dir): def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir,
output_test_dir):
files = os.listdir(train_dir) files = os.listdir(train_dir)
if not os.path.exists(output_train_dir): if not os.path.exists(output_train_dir):
os.mkdir(output_train_dir) os.mkdir(output_train_dir)
for fi in files: for fi in files:
with open(train_dir + '/' + fi, "r") as f: with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
with open(output_train_dir + '/' + fi, "w") as wf: with open(
os.path.join(output_train_dir, fi), "w",
encoding='utf-8') as wf:
data_file = csv.reader(f) data_file = csv.reader(f)
for row in data_file: for row in data_file:
tag_raw = re.split(r'\W+', row[0].strip()) tag_raw = re.split(r'\W+', row[0].strip())
...@@ -61,8 +70,10 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, outpu ...@@ -61,8 +70,10 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, outpu
if not os.path.exists(output_test_dir): if not os.path.exists(output_test_dir):
os.mkdir(output_test_dir) os.mkdir(output_test_dir)
for fi in files: for fi in files:
with open(test_dir + '/' + fi, "r") as f: with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
with open(output_test_dir + '/' + fi, "w") as wf: with open(
os.path.join(output_test_dir, fi), "w",
encoding='utf-8') as wf:
data_file = csv.reader(f) data_file = csv.reader(f)
for row in data_file: for row in data_file:
tag_raw = re.split(r'\W+', row[0].strip()) tag_raw = re.split(r'\W+', row[0].strip())
...@@ -74,18 +85,21 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, outpu ...@@ -74,18 +85,21 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, outpu
wf.write(str(w) + " ") wf.write(str(w) + " ")
wf.write("\n") wf.write("\n")
def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab_text, output_vocab_tag):
def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
output_vocab_text, output_vocab_tag):
print("start constuct word dict") print("start constuct word dict")
vocab_text = build_dict(2, 0, train_dir, test_dir) vocab_text = build_dict(2, 0, train_dir, test_dir)
with open(output_vocab_text, "w") as wf: with open(output_vocab_text, "w", encoding='utf-8') as wf:
wf.write(str(len(vocab_text)) + "\n") wf.write(str(len(vocab_text)) + "\n")
vocab_tag = build_dict(0, 0, train_dir, test_dir) vocab_tag = build_dict(0, 0, train_dir, test_dir)
with open(output_vocab_tag, "w") as wf: with open(output_vocab_tag, "w", encoding='utf-8') as wf:
wf.write(str(len(vocab_tag)) + "\n") wf.write(str(len(vocab_tag)) + "\n")
print("construct word dict done\n") print("construct word dict done\n")
write_paddle(vocab_text, vocab_tag, train_dir, test_dir, output_train_dir, output_test_dir) write_paddle(vocab_text, vocab_tag, train_dir, test_dir, output_train_dir,
output_test_dir)
train_dir = sys.argv[1] train_dir = sys.argv[1]
...@@ -94,4 +108,5 @@ output_train_dir = sys.argv[3] ...@@ -94,4 +108,5 @@ output_train_dir = sys.argv[3]
output_test_dir = sys.argv[4] output_test_dir = sys.argv[4]
output_vocab_text = sys.argv[5] output_vocab_text = sys.argv[5]
output_vocab_tag = sys.argv[6] output_vocab_tag = sys.argv[6]
text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab_text, output_vocab_tag) text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
output_vocab_text, output_vocab_tag)
...@@ -56,8 +56,8 @@ def train(): ...@@ -56,8 +56,8 @@ def train():
""" do training """ """ do training """
args = parse_args() args = parse_args()
if args.enable_ce: if args.enable_ce:
fluid.default_startup_program().random_seed = SEED fluid.default_startup_program().random_seed = SEED
fluid.default_main_program().random_seed = SEED fluid.default_main_program().random_seed = SEED
train_dir = args.train_dir train_dir = args.train_dir
vocab_text_path = args.vocab_text_path vocab_text_path = args.vocab_text_path
vocab_tag_path = args.vocab_tag_path vocab_tag_path = args.vocab_tag_path
...@@ -114,7 +114,8 @@ def train(): ...@@ -114,7 +114,8 @@ def train():
"neg_tag": lod_neg_tag "neg_tag": lod_neg_tag
}, },
fetch_list=[avg_cost.name, correct.name]) fetch_list=[avg_cost.name, correct.name])
ce_info.append(float(np.sum(correct_val)) / (args.num_devices * batch_size)) ce_info.append(
float(np.sum(correct_val)) / (args.num_devices * batch_size))
if batch_id % args.print_batch == 0: if batch_id % args.print_batch == 0:
print("TRAIN --> pass: {} batch_num: {} avg_cost: {}, acc: {}" print("TRAIN --> pass: {} batch_num: {} avg_cost: {}, acc: {}"
.format(pass_idx, (batch_id + 10) * batch_size, .format(pass_idx, (batch_id + 10) * batch_size,
...@@ -128,8 +129,7 @@ def train(): ...@@ -128,8 +129,7 @@ def train():
save_dir = "%s/epoch_%d" % (model_dir, epoch_idx) save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
feed_var_names = ["text", "pos_tag"] feed_var_names = ["text", "pos_tag"]
fetch_vars = [cos_pos] fetch_vars = [cos_pos]
fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
exe)
# only for ce # only for ce
if args.enable_ce: if args.enable_ce:
ce_acc = 0 ce_acc = 0
...@@ -142,17 +142,16 @@ def train(): ...@@ -142,17 +142,16 @@ def train():
if args.use_cuda: if args.use_cuda:
gpu_num = device[1] gpu_num = device[1]
print("kpis\teach_pass_duration_gpu%s\t%s" % print("kpis\teach_pass_duration_gpu%s\t%s" %
(gpu_num, total_time / epoch_idx)) (gpu_num, total_time / epoch_idx))
print("kpis\ttrain_acc_gpu%s\t%s" % print("kpis\ttrain_acc_gpu%s\t%s" % (gpu_num, ce_acc))
(gpu_num, ce_acc))
else: else:
cpu_num = device[1] cpu_num = device[1]
threads_num = device[2] threads_num = device[2]
print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" % print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" %
(cpu_num, threads_num, total_time / epoch_idx)) (cpu_num, threads_num, total_time / epoch_idx))
print("kpis\ttrain_acc_cpu%s_thread%s\t%s" % print("kpis\ttrain_acc_cpu%s_thread%s\t%s" %
(cpu_num, threads_num, ce_acc)) (cpu_num, threads_num, ce_acc))
print("finish training") print("finish training")
...@@ -165,7 +164,8 @@ def get_device(args): ...@@ -165,7 +164,8 @@ def get_device(args):
threads_num = os.environ.get('NUM_THREADS', 1) threads_num = os.environ.get('NUM_THREADS', 1)
cpu_num = os.environ.get('CPU_NUM', 1) cpu_num = os.environ.get('CPU_NUM', 1)
return "cpu", int(cpu_num), int(threads_num) return "cpu", int(cpu_num), int(threads_num)
if __name__ == "__main__": if __name__ == "__main__":
utils.check_version()
train() train()
...@@ -8,6 +8,11 @@ import numpy as np ...@@ -8,6 +8,11 @@ import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle import paddle
import csv import csv
import io
if six.PY2:
reload(sys)
sys.setdefaultencoding('utf-8')
def to_lodtensor(data, place): def to_lodtensor(data, place):
""" convert to LODtensor """ """ convert to LODtensor """
...@@ -24,12 +29,29 @@ def to_lodtensor(data, place): ...@@ -24,12 +29,29 @@ def to_lodtensor(data, place):
res.set_lod([lod]) res.set_lod([lod])
return res return res
def get_vocab_size(vocab_path): def get_vocab_size(vocab_path):
with open(vocab_path, "r") as rf: with io.open(vocab_path, "r") as rf:
line = rf.readline() line = rf.readline()
return int(line.strip()) return int(line.strip())
def check_version():
"""
Log error and exit when the installed version of paddlepaddle is
not satisfied.
"""
err = "PaddlePaddle version 1.6 or higher is required, " \
"or a suitable develop version is satisfied as well. \n" \
"Please make sure the version is good with your code." \
try:
fluid.require_version('1.6.0')
except Exception as e:
logger.error(err)
sys.exit(1)
def prepare_data(file_dir, def prepare_data(file_dir,
vocab_text_path, vocab_text_path,
vocab_tag_path, vocab_tag_path,
...@@ -45,19 +67,25 @@ def prepare_data(file_dir, ...@@ -45,19 +67,25 @@ def prepare_data(file_dir,
reader = sort_batch( reader = sort_batch(
paddle.reader.shuffle( paddle.reader.shuffle(
train( train(
file_dir, vocab_tag_size, neg_size, file_dir,
buffer_size, data_type=DataType.SEQ), vocab_tag_size,
neg_size,
buffer_size,
data_type=DataType.SEQ),
buf_size=buffer_size), buf_size=buffer_size),
batch_size, batch_size * 20) batch_size,
batch_size * 20)
else: else:
vocab_tag_size = get_vocab_size(vocab_tag_path) vocab_tag_size = get_vocab_size(vocab_tag_path)
vocab_text_size = 0 vocab_text_size = 0
reader = sort_batch( reader = sort_batch(
test( test(
file_dir, vocab_tag_size, buffer_size, data_type=DataType.SEQ), file_dir, vocab_tag_size, buffer_size, data_type=DataType.SEQ),
batch_size, batch_size * 20) batch_size,
batch_size * 20)
return vocab_text_size, vocab_tag_size, reader return vocab_text_size, vocab_tag_size, reader
def sort_batch(reader, batch_size, sort_group_size, drop_last=False): def sort_batch(reader, batch_size, sort_group_size, drop_last=False):
""" """
Create a batched reader. Create a batched reader.
...@@ -107,11 +135,13 @@ def sort_batch(reader, batch_size, sort_group_size, drop_last=False): ...@@ -107,11 +135,13 @@ def sort_batch(reader, batch_size, sort_group_size, drop_last=False):
class DataType(object): class DataType(object):
SEQ = 2 SEQ = 2
def train_reader_creator(file_dir, tag_size, neg_size, n, data_type): def train_reader_creator(file_dir, tag_size, neg_size, n, data_type):
def reader(): def reader():
files = os.listdir(file_dir) files = os.listdir(file_dir)
for fi in files: for fi in files:
with open(file_dir + '/' + fi, "r") as f: with io.open(
os.path.join(file_dir, fi), "r", encoding='utf-8') as f:
for l in f: for l in f:
l = l.strip().split(",") l = l.strip().split(",")
pos_index = int(l[0]) pos_index = int(l[0])
...@@ -123,7 +153,7 @@ def train_reader_creator(file_dir, tag_size, neg_size, n, data_type): ...@@ -123,7 +153,7 @@ def train_reader_creator(file_dir, tag_size, neg_size, n, data_type):
max_iter = 100 max_iter = 100
now_iter = 0 now_iter = 0
sum_n = 0 sum_n = 0
while(sum_n < neg_size) : while (sum_n < neg_size):
now_iter += 1 now_iter += 1
if now_iter > max_iter: if now_iter > max_iter:
print("error : only one class") print("error : only one class")
...@@ -135,13 +165,16 @@ def train_reader_creator(file_dir, tag_size, neg_size, n, data_type): ...@@ -135,13 +165,16 @@ def train_reader_creator(file_dir, tag_size, neg_size, n, data_type):
sum_n += 1 sum_n += 1
if n > 0 and len(text) > n: continue if n > 0 and len(text) > n: continue
yield text, pos_tag, neg_tag yield text, pos_tag, neg_tag
return reader return reader
def test_reader_creator(file_dir, tag_size, n, data_type): def test_reader_creator(file_dir, tag_size, n, data_type):
def reader(): def reader():
files = os.listdir(file_dir) files = os.listdir(file_dir)
for fi in files: for fi in files:
with open(file_dir + '/' + fi, "r") as f: with io.open(
os.path.join(file_dir, fi), "r", encoding='utf-8') as f:
for l in f: for l in f:
l = l.strip().split(",") l = l.strip().split(",")
pos_index = int(l[0]) pos_index = int(l[0])
...@@ -153,11 +186,13 @@ def test_reader_creator(file_dir, tag_size, n, data_type): ...@@ -153,11 +186,13 @@ def test_reader_creator(file_dir, tag_size, n, data_type):
tag = [] tag = []
tag.append(ii) tag.append(ii)
yield text, tag, pos_tag yield text, tag, pos_tag
return reader return reader
def train(train_dir, tag_size, neg_size, n, data_type=DataType.SEQ): def train(train_dir, tag_size, neg_size, n, data_type=DataType.SEQ):
return train_reader_creator(train_dir, tag_size, neg_size, n, data_type) return train_reader_creator(train_dir, tag_size, neg_size, n, data_type)
def test(test_dir, tag_size, n, data_type=DataType.SEQ): def test(test_dir, tag_size, n, data_type=DataType.SEQ):
return test_reader_creator(test_dir, tag_size, n, data_type) return test_reader_creator(test_dir, tag_size, n, data_type)
...@@ -20,6 +20,8 @@ ...@@ -20,6 +20,8 @@
## 介绍 ## 介绍
本例实现了skip-gram模式的word2vector模型。 本例实现了skip-gram模式的word2vector模型。
**目前模型库下模型均要求使用PaddlePaddle 1.6及以上版本或适当的develop版本。**
同时推荐用户参考[ IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/124377) 同时推荐用户参考[ IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/124377)
## 数据下载 ## 数据下载
...@@ -36,7 +38,7 @@ mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tok ...@@ -36,7 +38,7 @@ mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tok
```bash ```bash
mkdir data mkdir data
wget https://paddlerec.bj.bcebos.com/word2vec/1-billion-word-language-modeling-benchmark-r13output.tar wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/1-billion-word-language-modeling-benchmark-r13output.tar
tar xvf 1-billion-word-language-modeling-benchmark-r13output.tar tar xvf 1-billion-word-language-modeling-benchmark-r13output.tar
mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/ data/ mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/ data/
``` ```
...@@ -45,7 +47,7 @@ mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tok ...@@ -45,7 +47,7 @@ mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tok
```bash ```bash
mkdir data mkdir data
wget https://paddlerec.bj.bcebos.com/word2vec/text.tar wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/text.tar
tar xvf text.tar tar xvf text.tar
mv text data/ mv text data/
``` ```
...@@ -95,7 +97,7 @@ python train.py -h ...@@ -95,7 +97,7 @@ python train.py -h
OPENBLAS_NUM_THREADS=1 CPU_NUM=5 python train.py --train_data_dir data/convert_text8 --dict_path data/test_build_dict --num_passes 10 --batch_size 100 --model_output_dir v1_cpu5_b100_lr1dir --base_lr 1.0 --print_batch 1000 --with_speed --is_sparse OPENBLAS_NUM_THREADS=1 CPU_NUM=5 python train.py --train_data_dir data/convert_text8 --dict_path data/test_build_dict --num_passes 10 --batch_size 100 --model_output_dir v1_cpu5_b100_lr1dir --base_lr 1.0 --print_batch 1000 --with_speed --is_sparse
``` ```
本地单机模拟多机训练 本地单机模拟多机训练, 目前暂不支持windows。
```bash ```bash
sh cluster_train.sh sh cluster_train.sh
...@@ -106,9 +108,9 @@ sh cluster_train.sh ...@@ -106,9 +108,9 @@ sh cluster_train.sh
```bash ```bash
#全量数据集测试集 #全量数据集测试集
wget https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar
#样本数据集测试集 #样本数据集测试集
wget https://paddlerec.bj.bcebos.com/word2vec/test_mid_dir.tar wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_mid_dir.tar
``` ```
预测命令,注意词典名称需要加后缀"_word_to_id_", 此文件是预处理阶段生成的。 预测命令,注意词典名称需要加后缀"_word_to_id_", 此文件是预处理阶段生成的。
......
...@@ -10,6 +10,9 @@ import paddle.fluid as fluid ...@@ -10,6 +10,9 @@ import paddle.fluid as fluid
import paddle import paddle
import net import net
import utils import utils
if six.PY2:
reload(sys)
sys.setdefaultencoding('utf-8')
def parse_args(): def parse_args():
...@@ -76,15 +79,12 @@ def infer_epoch(args, vocab_size, test_reader, use_cuda, i2w): ...@@ -76,15 +79,12 @@ def infer_epoch(args, vocab_size, test_reader, use_cuda, i2w):
for data in test_reader(): for data in test_reader():
step_id += 1 step_id += 1
b_size = len([dat[0] for dat in data]) b_size = len([dat[0] for dat in data])
wa = np.array( wa = np.array([dat[0] for dat in data]).astype(
[dat[0] for dat in data]).astype("int64").reshape( "int64").reshape(b_size)
b_size, 1) wb = np.array([dat[1] for dat in data]).astype(
wb = np.array( "int64").reshape(b_size)
[dat[1] for dat in data]).astype("int64").reshape( wc = np.array([dat[2] for dat in data]).astype(
b_size, 1) "int64").reshape(b_size)
wc = np.array(
[dat[2] for dat in data]).astype("int64").reshape(
b_size, 1)
label = [dat[3] for dat in data] label = [dat[3] for dat in data]
input_word = [dat[4] for dat in data] input_word = [dat[4] for dat in data]
...@@ -93,9 +93,8 @@ def infer_epoch(args, vocab_size, test_reader, use_cuda, i2w): ...@@ -93,9 +93,8 @@ def infer_epoch(args, vocab_size, test_reader, use_cuda, i2w):
"analogy_a": wa, "analogy_a": wa,
"analogy_b": wb, "analogy_b": wb,
"analogy_c": wc, "analogy_c": wc,
"all_label": "all_label": np.arange(vocab_size)
np.arange(vocab_size).reshape( .reshape(vocab_size).astype("int64"),
vocab_size, 1).astype("int64"),
}, },
fetch_list=[pred.name, values], fetch_list=[pred.name, values],
return_numpy=False) return_numpy=False)
...@@ -143,15 +142,12 @@ def infer_step(args, vocab_size, test_reader, use_cuda, i2w): ...@@ -143,15 +142,12 @@ def infer_step(args, vocab_size, test_reader, use_cuda, i2w):
for data in test_reader(): for data in test_reader():
step_id += 1 step_id += 1
b_size = len([dat[0] for dat in data]) b_size = len([dat[0] for dat in data])
wa = np.array( wa = np.array([dat[0] for dat in data]).astype(
[dat[0] for dat in data]).astype("int64").reshape( "int64").reshape(b_size)
b_size, 1) wb = np.array([dat[1] for dat in data]).astype(
wb = np.array( "int64").reshape(b_size)
[dat[1] for dat in data]).astype("int64").reshape( wc = np.array([dat[2] for dat in data]).astype(
b_size, 1) "int64").reshape(b_size)
wc = np.array(
[dat[2] for dat in data]).astype("int64").reshape(
b_size, 1)
label = [dat[3] for dat in data] label = [dat[3] for dat in data]
input_word = [dat[4] for dat in data] input_word = [dat[4] for dat in data]
...@@ -162,7 +158,7 @@ def infer_step(args, vocab_size, test_reader, use_cuda, i2w): ...@@ -162,7 +158,7 @@ def infer_step(args, vocab_size, test_reader, use_cuda, i2w):
"analogy_b": wb, "analogy_b": wb,
"analogy_c": wc, "analogy_c": wc,
"all_label": "all_label":
np.arange(vocab_size).reshape(vocab_size, 1), np.arange(vocab_size).reshape(vocab_size),
}, },
fetch_list=[pred.name, values], fetch_list=[pred.name, values],
return_numpy=False) return_numpy=False)
...@@ -185,6 +181,7 @@ def infer_step(args, vocab_size, test_reader, use_cuda, i2w): ...@@ -185,6 +181,7 @@ def infer_step(args, vocab_size, test_reader, use_cuda, i2w):
if __name__ == "__main__": if __name__ == "__main__":
utils.check_version()
args = parse_args() args = parse_args()
start_index = args.start_index start_index = args.start_index
last_index = args.last_index last_index = args.last_index
......
...@@ -23,10 +23,10 @@ import paddle.fluid as fluid ...@@ -23,10 +23,10 @@ import paddle.fluid as fluid
def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5): def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5):
datas = [] datas = []
input_word = fluid.layers.data(name="input_word", shape=[1], dtype='int64') input_word = fluid.data(name="input_word", shape=[None, 1], dtype='int64')
true_word = fluid.layers.data(name='true_label', shape=[1], dtype='int64') true_word = fluid.data(name='true_label', shape=[None, 1], dtype='int64')
neg_word = fluid.layers.data( neg_word = fluid.data(
name="neg_label", shape=[neg_num], dtype='int64') name="neg_label", shape=[None, neg_num], dtype='int64')
datas.append(input_word) datas.append(input_word)
datas.append(true_word) datas.append(true_word)
...@@ -37,7 +37,7 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5): ...@@ -37,7 +37,7 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5):
words = fluid.layers.read_file(py_reader) words = fluid.layers.read_file(py_reader)
init_width = 0.5 / embedding_size init_width = 0.5 / embedding_size
input_emb = fluid.layers.embedding( input_emb = fluid.embedding(
input=words[0], input=words[0],
is_sparse=is_sparse, is_sparse=is_sparse,
size=[dict_size, embedding_size], size=[dict_size, embedding_size],
...@@ -45,33 +45,31 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5): ...@@ -45,33 +45,31 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5):
name='emb', name='emb',
initializer=fluid.initializer.Uniform(-init_width, init_width))) initializer=fluid.initializer.Uniform(-init_width, init_width)))
true_emb_w = fluid.layers.embedding( true_emb_w = fluid.embedding(
input=words[1], input=words[1],
is_sparse=is_sparse, is_sparse=is_sparse,
size=[dict_size, embedding_size], size=[dict_size, embedding_size],
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name='emb_w', initializer=fluid.initializer.Constant(value=0.0))) name='emb_w', initializer=fluid.initializer.Constant(value=0.0)))
true_emb_b = fluid.layers.embedding( true_emb_b = fluid.embedding(
input=words[1], input=words[1],
is_sparse=is_sparse, is_sparse=is_sparse,
size=[dict_size, 1], size=[dict_size, 1],
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name='emb_b', initializer=fluid.initializer.Constant(value=0.0))) name='emb_b', initializer=fluid.initializer.Constant(value=0.0)))
neg_word_reshape = fluid.layers.reshape(words[2], shape=[-1, 1]) input_emb = fluid.layers.squeeze(input=input_emb, axes=[1])
neg_word_reshape.stop_gradient = True true_emb_w = fluid.layers.squeeze(input=true_emb_w, axes=[1])
true_emb_b = fluid.layers.squeeze(input=true_emb_b, axes=[1])
neg_emb_w = fluid.layers.embedding( neg_emb_w = fluid.embedding(
input=neg_word_reshape, input=words[2],
is_sparse=is_sparse, is_sparse=is_sparse,
size=[dict_size, embedding_size], size=[dict_size, embedding_size],
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name='emb_w', learning_rate=1.0)) name='emb_w', learning_rate=1.0))
neg_emb_b = fluid.embedding(
neg_emb_w_re = fluid.layers.reshape( input=words[2],
neg_emb_w, shape=[-1, neg_num, embedding_size])
neg_emb_b = fluid.layers.embedding(
input=neg_word_reshape,
is_sparse=is_sparse, is_sparse=is_sparse,
size=[dict_size, 1], size=[dict_size, 1],
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
...@@ -86,8 +84,7 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5): ...@@ -86,8 +84,7 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5):
true_emb_b) true_emb_b)
input_emb_re = fluid.layers.reshape( input_emb_re = fluid.layers.reshape(
input_emb, shape=[-1, 1, embedding_size]) input_emb, shape=[-1, 1, embedding_size])
neg_matmul = fluid.layers.matmul( neg_matmul = fluid.layers.matmul(input_emb_re, neg_emb_w, transpose_y=True)
input_emb_re, neg_emb_w_re, transpose_y=True)
neg_matmul_re = fluid.layers.reshape(neg_matmul, shape=[-1, neg_num]) neg_matmul_re = fluid.layers.reshape(neg_matmul, shape=[-1, neg_num])
neg_logits = fluid.layers.elementwise_add(neg_matmul_re, neg_emb_b_vec) neg_logits = fluid.layers.elementwise_add(neg_matmul_re, neg_emb_b_vec)
#nce loss #nce loss
...@@ -111,22 +108,18 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5): ...@@ -111,22 +108,18 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5):
def infer_network(vocab_size, emb_size): def infer_network(vocab_size, emb_size):
analogy_a = fluid.layers.data(name="analogy_a", shape=[1], dtype='int64') analogy_a = fluid.data(name="analogy_a", shape=[None], dtype='int64')
analogy_b = fluid.layers.data(name="analogy_b", shape=[1], dtype='int64') analogy_b = fluid.data(name="analogy_b", shape=[None], dtype='int64')
analogy_c = fluid.layers.data(name="analogy_c", shape=[1], dtype='int64') analogy_c = fluid.data(name="analogy_c", shape=[None], dtype='int64')
all_label = fluid.layers.data( all_label = fluid.data(name="all_label", shape=[vocab_size], dtype='int64')
name="all_label", emb_all_label = fluid.embedding(
shape=[vocab_size, 1],
dtype='int64',
append_batch_size=False)
emb_all_label = fluid.layers.embedding(
input=all_label, size=[vocab_size, emb_size], param_attr="emb") input=all_label, size=[vocab_size, emb_size], param_attr="emb")
emb_a = fluid.layers.embedding( emb_a = fluid.embedding(
input=analogy_a, size=[vocab_size, emb_size], param_attr="emb") input=analogy_a, size=[vocab_size, emb_size], param_attr="emb")
emb_b = fluid.layers.embedding( emb_b = fluid.embedding(
input=analogy_b, size=[vocab_size, emb_size], param_attr="emb") input=analogy_b, size=[vocab_size, emb_size], param_attr="emb")
emb_c = fluid.layers.embedding( emb_c = fluid.embedding(
input=analogy_c, size=[vocab_size, emb_size], param_attr="emb") input=analogy_c, size=[vocab_size, emb_size], param_attr="emb")
target = fluid.layers.elementwise_add( target = fluid.layers.elementwise_add(
fluid.layers.elementwise_sub(emb_b, emb_a), emb_c) fluid.layers.elementwise_sub(emb_b, emb_a), emb_c)
......
...@@ -6,6 +6,10 @@ import six ...@@ -6,6 +6,10 @@ import six
import argparse import argparse
import io import io
import math import math
import sys
if six.PY2:
reload(sys)
sys.setdefaultencoding('utf-8')
prog = re.compile("[^a-z ]", flags=0) prog = re.compile("[^a-z ]", flags=0)
...@@ -110,10 +114,14 @@ def filter_corpus(args): ...@@ -110,10 +114,14 @@ def filter_corpus(args):
if not os.path.exists(args.output_corpus_dir): if not os.path.exists(args.output_corpus_dir):
os.makedirs(args.output_corpus_dir) os.makedirs(args.output_corpus_dir)
for file in os.listdir(args.input_corpus_dir): for file in os.listdir(args.input_corpus_dir):
with io.open(args.output_corpus_dir + '/convert_' + file, "w") as wf: with io.open(
os.path.join(args.output_corpus_dir, 'convert_' + file),
"w",
encoding='utf-8') as wf:
with io.open( with io.open(
args.input_corpus_dir + '/' + file, encoding='utf-8') as rf: os.path.join(args.input_corpus_dir, file),
print(args.input_corpus_dir + '/' + file) encoding='utf-8') as rf:
print(os.path.join(args.input_corpus_dir, file))
for line in rf: for line in rf:
signal = False signal = False
line = text_strip(line) line = text_strip(line)
......
...@@ -12,6 +12,12 @@ import six ...@@ -12,6 +12,12 @@ import six
import reader import reader
from net import skip_gram_word2vec from net import skip_gram_word2vec
import utils
import sys
if six.PY2:
reload(sys)
sys.setdefaultencoding('utf-8')
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid") logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
...@@ -224,5 +230,6 @@ def train(args): ...@@ -224,5 +230,6 @@ def train(args):
if __name__ == '__main__': if __name__ == '__main__':
utils.check_version()
args = parse_args() args = parse_args()
train(args) train(args)
...@@ -7,12 +7,13 @@ import paddle.fluid as fluid ...@@ -7,12 +7,13 @@ import paddle.fluid as fluid
import paddle import paddle
import os import os
import preprocess import preprocess
import io
def BuildWord_IdMap(dict_path): def BuildWord_IdMap(dict_path):
word_to_id = dict() word_to_id = dict()
id_to_word = dict() id_to_word = dict()
with open(dict_path, 'r') as f: with io.open(dict_path, 'r', encoding='utf-8') as f:
for line in f: for line in f:
word_to_id[line.split(' ')[0]] = int(line.split(' ')[1]) word_to_id[line.split(' ')[0]] = int(line.split(' ')[1])
id_to_word[int(line.split(' ')[1])] = line.split(' ')[0] id_to_word[int(line.split(' ')[1])] = line.split(' ')[0]
...@@ -22,10 +23,26 @@ def BuildWord_IdMap(dict_path): ...@@ -22,10 +23,26 @@ def BuildWord_IdMap(dict_path):
def prepare_data(file_dir, dict_path, batch_size): def prepare_data(file_dir, dict_path, batch_size):
w2i, i2w = BuildWord_IdMap(dict_path) w2i, i2w = BuildWord_IdMap(dict_path)
vocab_size = len(i2w) vocab_size = len(i2w)
reader = paddle.batch(test(file_dir, w2i), batch_size) reader = fluid.io.batch(test(file_dir, w2i), batch_size)
return vocab_size, reader, i2w return vocab_size, reader, i2w
def check_version():
"""
Log error and exit when the installed version of paddlepaddle is
not satisfied.
"""
err = "PaddlePaddle version 1.6 or higher is required, " \
"or a suitable develop version is satisfied as well. \n" \
"Please make sure the version is good with your code." \
try:
fluid.require_version('1.6.0')
except Exception as e:
logger.error(err)
sys.exit(1)
def native_to_unicode(s): def native_to_unicode(s):
if _is_unicode(s): if _is_unicode(s):
return s return s
...@@ -75,7 +92,8 @@ def reader_creator(file_dir, word_to_id): ...@@ -75,7 +92,8 @@ def reader_creator(file_dir, word_to_id):
def reader(): def reader():
files = os.listdir(file_dir) files = os.listdir(file_dir)
for fi in files: for fi in files:
with open(file_dir + '/' + fi, "r") as f: with io.open(
os.path.join(file_dir, fi), "r", encoding='utf-8') as f:
for line in f: for line in f:
if ':' in line: if ':' in line:
pass pass
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册