Unverified · Commit c08c5c5d authored by zhang wenhui, committed by GitHub

Local develop (#4007)

* cherry-pick 1.6 to develop

* fix outdated layers.nn api

* fix open utf8
Parent d267b76b
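The "fix open utf8" change addresses a Python 2 incompatibility: the built-in `open()` accepts an `encoding` argument only on Python 3, while `io.open()` accepts it on both interpreters (on Python 3 it is the same object as the built-in `open()`). A minimal sketch of the pattern the commit switches to, with an illustrative file name:

```
import io

# io.open works on Python 2 and 3 and accepts `encoding`;
# on Python 3 it is an alias of the built-in open().
with io.open("vocab.txt", "w", encoding='utf-8') as wf:
    # Text-mode io.open handles expect unicode, so write a
    # unicode literal (u"..." is valid on both interpreters).
    wf.write(u"1000\n")
```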
@@ -30,11 +30,11 @@ def build_dict(min_word_freq=0, train_dir="", test_dir=""):
     word_freq = collections.defaultdict(int)
     files = os.listdir(train_dir)
     for fi in files:
-        with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
+        with open(os.path.join(train_dir, fi), "r") as f:
             word_freq = word_count(f, word_freq)
     files = os.listdir(test_dir)
     for fi in files:
-        with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
+        with open(os.path.join(test_dir, fi), "r") as f:
             word_freq = word_count(f, word_freq)
     word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq]
@@ -50,10 +50,8 @@ def write_paddle(word_idx, train_dir, test_dir, output_train_dir,
     if not os.path.exists(output_train_dir):
         os.mkdir(output_train_dir)
     for fi in files:
-        with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
-            with open(
-                    os.path.join(output_train_dir, fi), "w",
-                    encoding='utf-8') as wf:
+        with open(os.path.join(train_dir, fi), "r") as f:
+            with open(os.path.join(output_train_dir, fi), "w") as wf:
                 for l in f:
                     l = l.strip().split()
                     l = [word_idx.get(w) for w in l]
@@ -65,10 +63,8 @@ def write_paddle(word_idx, train_dir, test_dir, output_train_dir,
     if not os.path.exists(output_test_dir):
         os.mkdir(output_test_dir)
     for fi in files:
-        with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
-            with open(
-                    os.path.join(output_test_dir, fi), "w",
-                    encoding='utf-8') as wf:
+        with open(os.path.join(test_dir, fi), "r") as f:
+            with open(os.path.join(output_test_dir, fi), "w") as wf:
                 for l in f:
                     l = l.strip().split()
                     l = [word_idx.get(w) for w in l]
@@ -13,7 +13,6 @@
 # limitations under the License.
 import paddle.fluid as fluid
-import paddle.fluid.layers.nn as nn
 import paddle.fluid.layers.tensor as tensor
 import paddle.fluid.layers.control_flow as cf
 import paddle.fluid.layers.io as io
@@ -26,7 +25,7 @@ class BowEncoder(object):
         self.param_name = ""

     def forward(self, emb):
-        return nn.sequence_pool(input=emb, pool_type='sum')
+        return fluid.layers.sequence_pool(input=emb, pool_type='sum')


 class CNNEncoder(object):
@@ -63,18 +62,18 @@ class GrnnEncoder(object):
         self.hidden_size = hidden_size

     def forward(self, emb):
-        fc0 = nn.fc(input=emb,
-                    size=self.hidden_size * 3,
-                    param_attr=self.param_name + "_fc.w",
-                    bias_attr=False)
+        fc0 = fluid.layers.fc(input=emb,
+                              size=self.hidden_size * 3,
+                              param_attr=self.param_name + "_fc.w",
+                              bias_attr=False)
-        gru_h = nn.dynamic_gru(
+        gru_h = fluid.layers.dynamic_gru(
             input=fc0,
             size=self.hidden_size,
             is_reverse=False,
             param_attr=self.param_name + ".param",
             bias_attr=self.param_name + ".bias")
-        return nn.sequence_pool(input=gru_h, pool_type='max')
+        return fluid.layers.sequence_pool(input=gru_h, pool_type='max')


 '''this is a very simple Encoder factory
@@ -117,7 +116,7 @@ class MultiviewSimnet(object):
     def get_correct(self, x, y):
         less = tensor.cast(cf.less_than(x, y), dtype='float32')
-        correct = nn.reduce_sum(less)
+        correct = fluid.layers.reduce_sum(less)
         return correct

     def train_net(self):
@@ -167,30 +166,30 @@ class MultiviewSimnet(object):
         ]

         # concat multi view for query, pos_title, neg_title
-        q_concat = nn.concat(q_encodes)
-        pt_concat = nn.concat(pt_encodes)
-        nt_concat = nn.concat(nt_encodes)
+        q_concat = fluid.layers.concat(q_encodes)
+        pt_concat = fluid.layers.concat(pt_encodes)
+        nt_concat = fluid.layers.concat(nt_encodes)

         # projection of hidden layer
-        q_hid = nn.fc(q_concat,
-                      size=self.hidden_size,
-                      param_attr='q_fc.w',
-                      bias_attr='q_fc.b')
-        pt_hid = nn.fc(pt_concat,
-                       size=self.hidden_size,
-                       param_attr='t_fc.w',
-                       bias_attr='t_fc.b')
-        nt_hid = nn.fc(nt_concat,
-                       size=self.hidden_size,
-                       param_attr='t_fc.w',
-                       bias_attr='t_fc.b')
+        q_hid = fluid.layers.fc(q_concat,
+                                size=self.hidden_size,
+                                param_attr='q_fc.w',
+                                bias_attr='q_fc.b')
+        pt_hid = fluid.layers.fc(pt_concat,
+                                 size=self.hidden_size,
+                                 param_attr='t_fc.w',
+                                 bias_attr='t_fc.b')
+        nt_hid = fluid.layers.fc(nt_concat,
+                                 size=self.hidden_size,
+                                 param_attr='t_fc.w',
+                                 bias_attr='t_fc.b')

         # cosine of hidden layers
-        cos_pos = nn.cos_sim(q_hid, pt_hid)
-        cos_neg = nn.cos_sim(q_hid, nt_hid)
+        cos_pos = fluid.layers.cos_sim(q_hid, pt_hid)
+        cos_neg = fluid.layers.cos_sim(q_hid, nt_hid)

         # pairwise hinge_loss
-        loss_part1 = nn.elementwise_sub(
+        loss_part1 = fluid.layers.elementwise_sub(
             tensor.fill_constant_batch_size_like(
                 input=cos_pos,
                 shape=[-1, 1],
@@ -198,14 +197,14 @@ class MultiviewSimnet(object):
                 dtype='float32'),
             cos_pos)

-        loss_part2 = nn.elementwise_add(loss_part1, cos_neg)
+        loss_part2 = fluid.layers.elementwise_add(loss_part1, cos_neg)

-        loss_part3 = nn.elementwise_max(
+        loss_part3 = fluid.layers.elementwise_max(
             tensor.fill_constant_batch_size_like(
                 input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
             loss_part2)

-        avg_cost = nn.mean(loss_part3)
+        avg_cost = fluid.layers.mean(loss_part3)
         correct = self.get_correct(cos_neg, cos_pos)
         return q_slots + pt_slots + nt_slots, avg_cost, correct
@@ -240,17 +239,18 @@ class MultiviewSimnet(object):
             self.title_encoders[i].forward(emb) for i, emb in enumerate(pt_embs)
         ]

         # concat multi view for query, pos_title, neg_title
-        q_concat = nn.concat(q_encodes)
-        pt_concat = nn.concat(pt_encodes)
+        q_concat = fluid.layers.concat(q_encodes)
+        pt_concat = fluid.layers.concat(pt_encodes)

         # projection of hidden layer
-        q_hid = nn.fc(q_concat,
-                      size=self.hidden_size,
-                      param_attr='q_fc.w',
-                      bias_attr='q_fc.b')
-        pt_hid = nn.fc(pt_concat,
-                       size=self.hidden_size,
-                       param_attr='t_fc.w',
-                       bias_attr='t_fc.b')
+        q_hid = fluid.layers.fc(q_concat,
+                                size=self.hidden_size,
+                                param_attr='q_fc.w',
+                                bias_attr='q_fc.b')
+        pt_hid = fluid.layers.fc(pt_concat,
+                                 size=self.hidden_size,
+                                 param_attr='t_fc.w',
+                                 bias_attr='t_fc.b')

         # cosine of hidden layers
-        cos = nn.cos_sim(q_hid, pt_hid)
+        cos = fluid.layers.cos_sim(q_hid, pt_hid)
         return cos
@@ -26,7 +26,7 @@ class BowEncoder(object):
         self.param_name = ""

     def forward(self, emb):
-        return nn.sequence_pool(input=emb, pool_type='sum')
+        return fluid.layers.sequence_pool(input=emb, pool_type='sum')


 class GrnnEncoder(object):
@@ -37,18 +37,18 @@ class GrnnEncoder(object):
         self.hidden_size = hidden_size

     def forward(self, emb):
-        fc0 = nn.fc(input=emb,
-                    size=self.hidden_size * 3,
-                    param_attr=self.param_name + "_fc.w",
-                    bias_attr=False)
+        fc0 = fluid.layers.fc(input=emb,
+                              size=self.hidden_size * 3,
+                              param_attr=self.param_name + "_fc.w",
+                              bias_attr=False)
-        gru_h = nn.dynamic_gru(
+        gru_h = fluid.layers.dynamic_gru(
             input=fc0,
             size=self.hidden_size,
             is_reverse=False,
             param_attr=self.param_name + ".param",
             bias_attr=self.param_name + ".bias")
-        return nn.sequence_pool(input=gru_h, pool_type='max')
+        return fluid.layers.sequence_pool(input=gru_h, pool_type='max')


 class PairwiseHingeLoss(object):
@@ -56,12 +56,12 @@ class PairwiseHingeLoss(object):
         self.margin = margin

     def forward(self, pos, neg):
-        loss_part1 = nn.elementwise_sub(
+        loss_part1 = fluid.layers.elementwise_sub(
             tensor.fill_constant_batch_size_like(
                 input=pos, shape=[-1, 1], value=self.margin, dtype='float32'),
             pos)
-        loss_part2 = nn.elementwise_add(loss_part1, neg)
-        loss_part3 = nn.elementwise_max(
+        loss_part2 = fluid.layers.elementwise_add(loss_part1, neg)
+        loss_part3 = fluid.layers.elementwise_max(
             tensor.fill_constant_batch_size_like(
                 input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
             loss_part2)
@@ -82,7 +82,7 @@ class SequenceSemanticRetrieval(object):
     def get_correct(self, x, y):
         less = tensor.cast(cf.less_than(x, y), dtype='float32')
-        correct = nn.reduce_sum(less)
+        correct = fluid.layers.reduce_sum(less)
         return correct

     def train(self):
@@ -101,22 +101,22 @@ class SequenceSemanticRetrieval(object):
         user_enc = self.user_encoder.forward(user_emb)
         pos_item_enc = self.item_encoder.forward(pos_item_emb)
         neg_item_enc = self.item_encoder.forward(neg_item_emb)
-        user_hid = nn.fc(input=user_enc,
-                         size=self.hidden_size,
-                         param_attr='user.w',
-                         bias_attr="user.b")
-        pos_item_hid = nn.fc(input=pos_item_enc,
-                             size=self.hidden_size,
-                             param_attr='item.w',
-                             bias_attr="item.b")
-        neg_item_hid = nn.fc(input=neg_item_enc,
-                             size=self.hidden_size,
-                             param_attr='item.w',
-                             bias_attr="item.b")
-        cos_pos = nn.cos_sim(user_hid, pos_item_hid)
-        cos_neg = nn.cos_sim(user_hid, neg_item_hid)
+        user_hid = fluid.layers.fc(input=user_enc,
+                                   size=self.hidden_size,
+                                   param_attr='user.w',
+                                   bias_attr="user.b")
+        pos_item_hid = fluid.layers.fc(input=pos_item_enc,
+                                       size=self.hidden_size,
+                                       param_attr='item.w',
+                                       bias_attr="item.b")
+        neg_item_hid = fluid.layers.fc(input=neg_item_enc,
+                                       size=self.hidden_size,
+                                       param_attr='item.w',
+                                       bias_attr="item.b")
+        cos_pos = fluid.layers.cos_sim(user_hid, pos_item_hid)
+        cos_neg = fluid.layers.cos_sim(user_hid, neg_item_hid)
         hinge_loss = self.pairwise_hinge_loss.forward(cos_pos, cos_neg)
-        avg_cost = nn.mean(hinge_loss)
+        avg_cost = fluid.layers.mean(hinge_loss)
         correct = self.get_correct(cos_neg, cos_pos)
         return [user_data, pos_item_data,
@@ -44,6 +44,8 @@ The Tagspace model learns embedding representations of text and tags, for industrial-scale tag
 After decompressing the backup data, convert the text data to paddle-format data; first move the data into the training data directory and the test data directory:
 ```
+mkdir raw_big_train_data
+mkdir raw_big_test_data
 mv train.csv raw_big_train_data
 mv test.csv raw_big_test_data
 ```
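For reference, a hypothetical way to drive the conversion from Python, following the `text2paddle(...)` signature visible in the diff below (the module name and the output directory and vocab file names here are assumptions):

```
from text2paddle import text2paddle

# Argument order matches the text2paddle() definition shown below;
# all paths are made-up examples.
text2paddle("raw_big_train_data", "raw_big_test_data",
            "train_big_data", "test_big_data",
            "vocab_text.txt", "vocab_tag.txt")
```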
@@ -3,6 +3,7 @@ import six
 import collections
 import os
 import csv
+import io
 import re
 import sys
 if six.PY2:
@@ -31,11 +32,11 @@ def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""):
     word_freq = collections.defaultdict(int)
     files = os.listdir(train_dir)
     for fi in files:
-        with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
+        with io.open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
             word_freq = word_count(column_num, f, word_freq)
     files = os.listdir(test_dir)
     for fi in files:
-        with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
+        with io.open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
             word_freq = word_count(column_num, f, word_freq)
     word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq]
@@ -51,10 +52,8 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir,
     if not os.path.exists(output_train_dir):
         os.mkdir(output_train_dir)
     for fi in files:
-        with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
-            with open(
-                    os.path.join(output_train_dir, fi), "w",
-                    encoding='utf-8') as wf:
+        with open(os.path.join(train_dir, fi), "r") as f:
+            with open(os.path.join(output_train_dir, fi), "w") as wf:
                 data_file = csv.reader(f)
                 for row in data_file:
                     tag_raw = re.split(r'\W+', row[0].strip())
@@ -70,10 +69,8 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir,
     if not os.path.exists(output_test_dir):
         os.mkdir(output_test_dir)
     for fi in files:
-        with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
-            with open(
-                    os.path.join(output_test_dir, fi), "w",
-                    encoding='utf-8') as wf:
+        with open(os.path.join(test_dir, fi), "r") as f:
+            with open(os.path.join(output_test_dir, fi), "w") as wf:
                 data_file = csv.reader(f)
                 for row in data_file:
                     tag_raw = re.split(r'\W+', row[0].strip())
@@ -90,12 +87,13 @@ def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
                 output_vocab_text, output_vocab_tag):
     print("start constuct word dict")
     vocab_text = build_dict(2, 0, train_dir, test_dir)
-    with open(output_vocab_text, "w", encoding='utf-8') as wf:
-        wf.write(str(len(vocab_text)) + "\n")
+    with io.open(output_vocab_text, "w", encoding='utf-8') as wf:
+        wf.write((str(len(vocab_text)) + "\n").decode('utf-8'))
     vocab_tag = build_dict(0, 0, train_dir, test_dir)
-    with open(output_vocab_tag, "w", encoding='utf-8') as wf:
-        wf.write(str(len(vocab_tag)) + "\n")
+    with io.open(output_vocab_tag, "w", encoding='utf-8') as wf:
+        wf.write((str(len(vocab_tag)) + "\n").decode('utf-8'))
     print("construct word dict done\n")
     write_paddle(vocab_text, vocab_tag, train_dir, test_dir, output_train_dir,
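One caveat with the `(str(...) + "\n").decode('utf-8')` calls above: `.decode` exists only on Python 2 byte strings, so these lines are Python 2 specific (Python 3's `str` has no `.decode` method). Since the module already imports `six`, a version-agnostic alternative (a sketch, not what the commit does) could be:

```
import io
import six

vocab_size = 1000  # stands in for len(vocab_text)
with io.open("vocab_text.txt", "w", encoding='utf-8') as wf:
    # six.text_type is unicode on Python 2 and str on Python 3,
    # so the same write works under either interpreter.
    wf.write(six.text_type(vocab_size) + u"\n")
```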