Commit 40c2beda authored by: malin10

fix_code_style

Parent: edec99eb
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "paddlerec.models.match.dssm"
dataset:
......
......@@ -27,17 +27,21 @@ class Model(ModelBase):
self.Neg = envs.get_global_env("hyper_parameters.NEG")
self.hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes")
self.hidden_acts = envs.get_global_env("hyper_parameters.fc_acts")
self.learning_rate = envs.get_global_env("hyper_parameters.learning_rate")
self.learning_rate = envs.get_global_env(
"hyper_parameters.learning_rate")
def input_data(self, is_infer=False, **kwargs):
query = fluid.data(
name="query", shape=[-1, self.TRIGRAM_D], dtype='float32', lod_level=0)
name="query",
shape=[-1, self.TRIGRAM_D],
dtype='float32',
lod_level=0)
doc_pos = fluid.data(
name="doc_pos",
shape=[-1, self.TRIGRAM_D],
dtype='float32',
lod_level=0)
if is_infer:
return [query, doc_pos]
......@@ -78,14 +82,14 @@ class Model(ModelBase):
return
R_Q_D_ns = []
for i in range(len(inputs)-2):
doc_neg_fc_i = fc(inputs[i+2], self.hidden_layers, self.hidden_acts, [
'doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i),
'doc_neg_l3_' + str(i)
])
for i in range(len(inputs) - 2):
doc_neg_fc_i = fc(
inputs[i + 2], self.hidden_layers, self.hidden_acts, [
'doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i),
'doc_neg_l3_' + str(i)
])
R_Q_D_ns.append(fluid.layers.cos_sim(query_fc, doc_neg_fc_i))
concat_Rs = fluid.layers.concat(
input=[R_Q_D_p] + R_Q_D_ns, axis=-1)
concat_Rs = fluid.layers.concat(input=[R_Q_D_p] + R_Q_D_ns, axis=-1)
prob = fluid.layers.softmax(concat_Rs, axis=1)
hit_prob = fluid.layers.slice(
......@@ -94,4 +98,3 @@ class Model(ModelBase):
avg_cost = fluid.layers.mean(x=loss)
self._cost = avg_cost
self._metrics["LOSS"] = avg_cost
# DSSM
## Introduction
DSSM ([Learning Deep Structured Semantic Models for Web Search using Clickthrough Data](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cikm2013_DSSM_fullversion.pdf)) is a semantic model based on deep networks. Its core idea is to map the query and the doc into a semantic space of the same dimension and to maximize the cosine similarity between the query and doc semantic vectors, so that a latent semantic model is learned for retrieval; a word hashing step reduces the dimensionality of the input vectors. DSSM has a wide range of applications, for example search engine retrieval, ad relevance, question answering, and machine translation.
This project implements the DSSM model in PaddlePaddle following the network structure of the paper, and constructs a dataset to verify the correctness of the network; a minimal sketch of the training objective follows.
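The objective implemented in model.py boils down to a softmax over the cosine similarities between the query and the candidate docs. The snippet below is a minimal, framework-free sketch of that objective using numpy and illustrative names; it is not the PaddlePaddle implementation itself.

```python
# Minimal numpy sketch (illustrative names) of the DSSM training objective:
# cosine similarity between the query vector and each doc vector, a softmax
# over [positive, negatives], and -log(p_positive) as the loss.
import numpy as np


def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))


def dssm_loss(query_vec, doc_pos_vec, doc_neg_vecs):
    sims = np.array([cosine(query_vec, doc_pos_vec)] +
                    [cosine(query_vec, d) for d in doc_neg_vecs])
    probs = np.exp(sims) / np.exp(sims).sum()  # softmax over all candidates
    return -np.log(probs[0])                   # probability of the positive doc


rng = np.random.default_rng(0)
q, pos = rng.normal(size=128), rng.normal(size=128)
negs = [rng.normal(size=128) for _ in range(4)]  # NEG = 4 negative samples
print(dssm_loss(q, pos, negs))
```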
## Model hyperparameters
```
optimizer:
  class: sgd # optimizer type
  learning_rate: 0.01 # learning rate
  strategy: async # parameter update strategy
TRIGRAM_D: 1000 # dimension of the query and doc semantic vectors
NEG: 4 # number of negative samples
fc_sizes: [300, 300, 128] # sizes of the fc layers
fc_acts: ['tanh', 'tanh', 'tanh'] # activation functions of the fc layers
```
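These fields are consumed in model.py through `envs.get_global_env`. The short sketch below shows the mapping from YAML keys to model attributes (paths taken from `_init_hyper_parameters` in the diff above); it only returns meaningful values inside a PaddleRec run where the config has been loaded.

```python
# Mapping from the hyper_parameters section of config.yaml to model.py
# (see _init_hyper_parameters in the diff above).
from paddlerec.core.utils import envs

TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D")          # 1000
NEG = envs.get_global_env("hyper_parameters.NEG")                      # 4
fc_sizes = envs.get_global_env("hyper_parameters.fc_sizes")            # [300, 300, 128]
fc_acts = envs.get_global_env("hyper_parameters.fc_acts")              # ['tanh', 'tanh', 'tanh']
learning_rate = envs.get_global_env("hyper_parameters.learning_rate")  # 0.01
```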
## Quick start
PaddleRec ships with a small demo dataset so that users can try the model quickly. The training command is:
```bash
python -m paddlerec.run -m paddlerec.models.match.dssm
```
Before running inference, update the configuration in config.yaml as follows:
```
workspace: "~/code/paddlerec/models/match/dssm" # 改为当前config.yaml所在的绝对路径
#mode: runner1 # train
mode: runner2 # infer
runner:
- name: runner2
class: single_infer
init_model_path: "increment/2" # 改为需要预测的模型路径
phase:
- name: phase1
model: "{workspace}/model.py"
dataset_name: dataset_infer # 改成预测dataset
thread_num: 1 # dataset线程数
```
After the changes, run the inference command:
```
python -m paddlerec.run -m ./config.yaml
```
## Testing notes
Currently, the dataset used by the DSSM model is randomly generated, so for testing it is enough to run through the steps above on the demo dataset.
......@@ -16,7 +16,7 @@ from __future__ import print_function
from paddlerec.core.reader import Reader
class EvaluateReader(Reader):
class TrainReader(Reader):
def init(self):
pass
......
......@@ -101,12 +101,17 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def _init_hyper_parameters(self):
self.query_encoder = envs.get_global_env("hyper_parameters.query_encoder")
self.title_encoder = envs.get_global_env("hyper_parameters.title_encoder")
self.query_encode_dim = envs.get_global_env("hyper_parameters.query_encode_dim")
self.title_encode_dim = envs.get_global_env("hyper_parameters.title_encode_dim")
self.emb_size = envs.get_global_env("hyper_parameters.sparse_feature_dim")
self.query_encoder = envs.get_global_env(
"hyper_parameters.query_encoder")
self.title_encoder = envs.get_global_env(
"hyper_parameters.title_encoder")
self.query_encode_dim = envs.get_global_env(
"hyper_parameters.query_encode_dim")
self.title_encode_dim = envs.get_global_env(
"hyper_parameters.title_encode_dim")
self.emb_size = envs.get_global_env(
"hyper_parameters.sparse_feature_dim")
self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim")
self.emb_shape = [self.emb_size, self.emb_dim]
......@@ -125,62 +130,61 @@ class Model(ModelBase):
input=query, size=self.emb_shape, param_attr="emb")
for query in self.q_slots
]
# encode each embedding field with encoder
# encode each embedding field with encoder
q_encodes = [
self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
]
# concat multi view for query, pos_title, neg_title
q_concat = fluid.layers.concat(q_encodes)
# concat multi view for query, pos_title, neg_title
q_concat = fluid.layers.concat(q_encodes)
# projection of hidden layer
q_hid = fluid.layers.fc(q_concat,
q_hid = fluid.layers.fc(q_concat,
size=self.hidden_size,
param_attr='q_fc.w',
bias_attr='q_fc.b')
self.pt_slots = self._sparse_data_var[1:2]
self.title_encoders = [
factory.create(self.title_encoder, self.title_encode_dim)
]
pt_embs = [
pt_embs = [
fluid.embedding(
input=title, size=self.emb_shape, param_attr="emb")
for title in self.pt_slots
]
pt_encodes = [
pt_encodes = [
self.title_encoders[i].forward(emb)
for i, emb in enumerate(pt_embs)
]
pt_concat = fluid.layers.concat(pt_encodes)
pt_hid = fluid.layers.fc(pt_concat,
pt_concat = fluid.layers.concat(pt_encodes)
pt_hid = fluid.layers.fc(pt_concat,
size=self.hidden_size,
param_attr='t_fc.w',
bias_attr='t_fc.b')
# cosine of hidden layers
cos_pos = fluid.layers.cos_sim(q_hid, pt_hid)
# cosine of hidden layers
cos_pos = fluid.layers.cos_sim(q_hid, pt_hid)
if is_infer:
self._infer_results['query_pt_sim'] = cos_pos
return
self._infer_results['query_pt_sim'] = cos_pos
return
self.nt_slots = self._sparse_data_var[2:3]
nt_embs = [
nt_embs = [
fluid.embedding(
input=title, size=self.emb_shape, param_attr="emb")
for title in self.nt_slots
]
nt_encodes = [
nt_encodes = [
self.title_encoders[i].forward(emb)
for i, emb in enumerate(nt_embs)
]
nt_concat = fluid.layers.concat(nt_encodes)
nt_hid = fluid.layers.fc(nt_concat,
nt_concat = fluid.layers.concat(nt_encodes)
nt_hid = fluid.layers.fc(nt_concat,
size=self.hidden_size,
param_attr='t_fc.w',
bias_attr='t_fc.b')
cos_neg = fluid.layers.cos_sim(q_hid, nt_hid)
cos_neg = fluid.layers.cos_sim(q_hid, nt_hid)
# pairwise hinge_loss
# pairwise hinge_loss
loss_part1 = fluid.layers.elementwise_sub(
tensor.fill_constant_batch_size_like(
input=cos_pos,
......@@ -198,7 +202,7 @@ class Model(ModelBase):
self._cost = fluid.layers.mean(loss_part3)
self.acc = self.get_acc(cos_neg, cos_pos)
self._metrics["loss"] = self._cost
self._metrics["loss"] = self._cost
self._metrics["acc"] = self.acc
def get_acc(self, x, y):
......
# Multi-view Simnet for Personalized recommendation
## Introduction
In personalized recommendation scenarios, the list of items that a recommender system presents to a user is usually produced by a personalized matching model. In the real world, a user may have features from many views, such as the user id, age, and item click history; an item, for example a news article, likewise has features from multiple views, such as the news title and news category. The Multi-view Simnet model is a unified model that fuses the multi-view features of both users and candidate items and learns personalized matching. Models of this kind are used in many industrial settings, for example in Baidu's Feed products.
The goal of this project is to provide a model for the personalized matching scenario built with Paddle. The Multi-view Simnet model contains multiple encoder modules, each applied to a different feature view. Currently the project provides a Bag-of-Embedding encoder, a Temporal-Convolutional encoder, and a Gated-Recurrent-Unit encoder; encoders that are practical for sparse-feature scenarios will be added over time. The model is currently trained in pairwise ranking mode: for each related User-Item pair, an item is randomly sampled as a negative example and the model learns to rank against it, as sketched below.
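The pairwise hinge loss used here is `mean(max(0, margin - cos(q, pos_title) + cos(q, neg_title)))`, with `margin` taken from the hyperparameters below. The snippet is a minimal numpy sketch with illustrative names; the actual model builds the same quantity with `fluid.layers.cos_sim` and elementwise ops, as in the diff above.

```python
# Numpy sketch (illustrative names) of the pairwise hinge loss:
# loss = mean(max(0, margin - cos(q, pos_title) + cos(q, neg_title))).
import numpy as np


def batch_cosine(a, b):
    return np.sum(a * b, axis=-1) / (
        np.linalg.norm(a, axis=-1) * np.linalg.norm(b, axis=-1) + 1e-8)


def pairwise_hinge_loss(q_hid, pos_hid, neg_hid, margin=0.1):
    cos_pos = batch_cosine(q_hid, pos_hid)
    cos_neg = batch_cosine(q_hid, neg_hid)
    return np.mean(np.maximum(0.0, margin - cos_pos + cos_neg))


rng = np.random.default_rng(0)
q, p, n = (rng.normal(size=(8, 128)) for _ in range(3))  # batch 8, hidden 128
print(pairwise_hinge_loss(q, p, n, margin=0.1))
```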
## Model hyperparameters
```
optimizer:
  class: Adam # optimizer type
  learning_rate: 0.0001 # learning rate
  strategy: async # parameter update strategy
query_encoder: "bow" # encoder for user features
title_encoder: "bow" # encoder for item features
query_encode_dim: 128 # output dimension of the user-feature encoder
title_encode_dim: 128 # output dimension of the item-feature encoder
sparse_feature_dim: 1000001 # total number of user and item features
embedding_dim: 128 # embedding dimension
hidden_size: 128 # hidden layer dimension
margin: 0.1 # max margin for hinge-loss
```
## Quick start
PaddleRec ships with a small demo dataset so that users can try the model quickly. The training command is:
```bash
python -m paddlerec.run -m paddlerec.models.match.multiview-simnet
```
Before running inference, update the configuration in config.yaml as follows:
```
workspace: "~/code/paddlerec/models/match/multiview-simnet" # 改为当前config.yaml所在的绝对路径
#mode: runner1 # train
mode: runner2 # infer
runner:
- name: runner2
class: single_infer
init_model_path: "increment/2" # 改为需要预测的模型路径
phase:
- name: phase1
model: "{workspace}/model.py"
dataset_name: dataset_infer # 改成预测dataset
thread_num: 1 # dataset线程数
```
After the changes, run the inference command:
```
python -m paddlerec.run -m ./config.yaml
```
## Testing notes
Currently, the dataset used by the Multi-view Simnet model is randomly generated, so for testing it is enough to run through the steps above on the demo dataset.
......@@ -22,9 +22,10 @@ from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
dict_path = envs.get_global_env("dataset.dataset_infer.word_id_dict_path")
self.min_n = envs.get_global_env("hyper_parameters.min_n")
self.max_n = envs.get_global_env("hyper_parameters.max_n")
dict_path = envs.get_global_env(
"dataset.dataset_infer.word_id_dict_path")
self.min_n = envs.get_global_env("hyper_parameters.min_n")
self.max_n = envs.get_global_env("hyper_parameters.max_n")
self.word_to_id = dict()
self.id_to_word = dict()
with io.open(dict_path, 'r', encoding='utf-8') as f:
......@@ -78,7 +79,8 @@ class TrainReader(Reader):
a unicode string - a space-delimited sequence of words.
"""
return u" ".join([
"<" + word + ">" if "<" + word + ">" in original_vocab else u"<UNK>"
"<" + word + ">"
if "<" + word + ">" in original_vocab else u"<UNK>"
for word in line.split()
])
......@@ -99,9 +101,7 @@ class TrainReader(Reader):
res.append(self.word_to_id[_])
inputs.append(res)
print(inputs)
yield [('analogy_a', inputs[0]),
('analogy_b', inputs[1]),
('analogy_c', inputs[2]),
('analogy_d', inputs[3][0:1])]
yield [('analogy_a', inputs[0]), ('analogy_b', inputs[1]),
('analogy_c', inputs[2]), ('analogy_d', inputs[3][0:1])]
return reader
......@@ -24,27 +24,33 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def _init_hyper_parameters(self):
self.is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
self.sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number")
self.sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim")
self.is_distributed = True if envs.get_trainer(
) == "CtrTrainer" else False
self.sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number")
self.sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
self.with_shuffle_batch = envs.get_global_env("hyper_parameters.with_shuffle_batch")
self.learning_rate = envs.get_global_env("hyper_parameters.optimizer.learning_rate")
self.decay_steps = envs.get_global_env("hyper_parameters.optimizer.decay_steps")
self.decay_rate = envs.get_global_env("hyper_parameters.optimizer.decay_rate")
self.with_shuffle_batch = envs.get_global_env(
"hyper_parameters.with_shuffle_batch")
self.learning_rate = envs.get_global_env(
"hyper_parameters.optimizer.learning_rate")
self.decay_steps = envs.get_global_env(
"hyper_parameters.optimizer.decay_steps")
self.decay_rate = envs.get_global_env(
"hyper_parameters.optimizer.decay_rate")
def input_data(self, is_infer=False, **kwargs):
if is_infer:
analogy_a = fluid.data(
name="analogy_a", shape=[None, 1], lod_level=1, dtype='int64')
name="analogy_a", shape=[None, 1], lod_level=1, dtype='int64')
analogy_b = fluid.data(
name="analogy_b", shape=[None, 1], lod_level=1, dtype='int64')
analogy_c = fluid.data(
name="analogy_c", shape=[None, 1], lod_level=1, dtype='int64')
analogy_d = fluid.data(
name="analogy_d", shape=[None, 1], dtype='int64')
return [analogy_a, analogy_b, analogy_c, analogy_d]
return [analogy_a, analogy_b, analogy_c, analogy_d]
input_word = fluid.data(
name="input_word", shape=[None, 1], lod_level=1, dtype='int64')
......@@ -59,10 +65,10 @@ class Model(ModelBase):
def net(self, inputs, is_infer=False):
if is_infer:
self.infer_net(inputs)
return
self.infer_net(inputs)
return
def embedding_layer(input,
def embedding_layer(input,
table_name,
initializer_instance=None,
sequence_pool=False):
......@@ -74,7 +80,8 @@ class Model(ModelBase):
param_attr=fluid.ParamAttr(
name=table_name, initializer=initializer_instance), )
if sequence_pool:
emb = fluid.layers.sequence_pool(input=emb, pool_type='average')
emb = fluid.layers.sequence_pool(
input=emb, pool_type='average')
return emb
init_width = 1.0 / self.sparse_feature_dim
......@@ -83,10 +90,10 @@ class Model(ModelBase):
input_emb = embedding_layer(inputs[0], "emb", emb_initializer, True)
input_emb = fluid.layers.squeeze(input=input_emb, axes=[1])
true_emb_w = embedding_layer(inputs[1], "emb_w", emb_w_initializer, True)
true_emb_w = embedding_layer(inputs[1], "emb_w", emb_w_initializer,
True)
true_emb_w = fluid.layers.squeeze(input=true_emb_w, axes=[1])
if self.with_shuffle_batch:
neg_emb_w_list = []
for i in range(self.neg_num):
......@@ -95,7 +102,8 @@ class Model(ModelBase):
true_emb_w)) # shuffle true_word
neg_emb_w_concat = fluid.layers.concat(neg_emb_w_list, axis=0)
neg_emb_w = fluid.layers.reshape(
neg_emb_w_concat, shape=[-1, self.neg_num, self.sparse_feature_dim])
neg_emb_w_concat,
shape=[-1, self.neg_num, self.sparse_feature_dim])
else:
neg_emb_w = embedding_layer(inputs[2], "emb_w", emb_w_initializer)
true_logits = fluid.layers.reduce_sum(
......@@ -107,8 +115,7 @@ class Model(ModelBase):
input_emb, shape=[-1, 1, self.sparse_feature_dim])
neg_matmul = fluid.layers.matmul(
input_emb_re, neg_emb_w, transpose_y=True)
neg_logits = fluid.layers.reshape(
neg_matmul, shape=[-1, 1])
neg_logits = fluid.layers.reshape(neg_matmul, shape=[-1, 1])
logits = fluid.layers.concat([true_logits, neg_logits], axis=0)
label_ones = fluid.layers.fill_constant(
......@@ -120,13 +127,12 @@ class Model(ModelBase):
value=0.0,
dtype='float32')
label = fluid.layers.concat([label_ones, label_zeros], axis=0)
loss = fluid.layers.log_loss(fluid.layers.sigmoid(logits), label)
avg_cost = fluid.layers.reduce_sum(loss)
self._cost = avg_cost
self._metrics["LOSS"] = avg_cost
def optimizer(self):
optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
......@@ -137,13 +143,17 @@ class Model(ModelBase):
return optimizer
def infer_net(self, inputs):
def embedding_layer(input, table_name, initializer_instance=None, sequence_pool=False):
def embedding_layer(input,
table_name,
initializer_instance=None,
sequence_pool=False):
emb = fluid.embedding(
input=input,
size=[self.sparse_feature_number, self.sparse_feature_dim],
param_attr=table_name)
if sequence_pool:
emb = fluid.layers.sequence_pool(input=emb, pool_type='average')
emb = fluid.layers.sequence_pool(
input=emb, pool_type='average')
return emb
all_label = np.arange(self.sparse_feature_number).reshape(
......@@ -166,36 +176,34 @@ class Model(ModelBase):
dist = fluid.layers.matmul(
x=target, y=emb_all_label_l2, transpose_y=True)
values, pred_idx = fluid.layers.topk(input=dist, k=4)
label = fluid.layers.expand(
inputs[3],
expand_times=[1, 4])
label = fluid.layers.expand(inputs[3], expand_times=[1, 4])
label_ones = fluid.layers.fill_constant_batch_size_like(
label, shape=[-1, 1], value=1.0, dtype='float32')
right_cnt = fluid.layers.reduce_sum(input=fluid.layers.cast(
fluid.layers.equal(pred_idx, label), dtype='float32'))
total_cnt = fluid.layers.reduce_sum(label_ones)
# global_right_cnt = fluid.layers.create_global_var(
# name="global_right_cnt",
# persistable=True,
# dtype='float32',
# shape=[1],
# value=0)
# global_total_cnt = fluid.layers.create_global_var(
# name="global_total_cnt",
# persistable=True,
# dtype='float32',
# shape=[1],
# value=0)
# global_right_cnt.stop_gradient = True
# global_total_cnt.stop_gradient = True
# tmp1 = fluid.layers.elementwise_add(right_cnt, global_right_cnt)
# fluid.layers.assign(tmp1, global_right_cnt)
# tmp2 = fluid.layers.elementwise_add(total_cnt, global_total_cnt)
# fluid.layers.assign(tmp2, global_total_cnt)
# acc = fluid.layers.elementwise_div(
# global_right_cnt, global_total_cnt, name="total_acc")
# global_right_cnt = fluid.layers.create_global_var(
# name="global_right_cnt",
# persistable=True,
# dtype='float32',
# shape=[1],
# value=0)
# global_total_cnt = fluid.layers.create_global_var(
# name="global_total_cnt",
# persistable=True,
# dtype='float32',
# shape=[1],
# value=0)
# global_right_cnt.stop_gradient = True
# global_total_cnt.stop_gradient = True
# tmp1 = fluid.layers.elementwise_add(right_cnt, global_right_cnt)
# fluid.layers.assign(tmp1, global_right_cnt)
# tmp2 = fluid.layers.elementwise_add(total_cnt, global_total_cnt)
# fluid.layers.assign(tmp2, global_total_cnt)
# acc = fluid.layers.elementwise_div(
# global_right_cnt, global_total_cnt, name="total_acc")
acc = fluid.layers.elementwise_div(right_cnt, total_cnt, name="acc")
self._infer_results['acc'] = acc
......@@ -45,18 +45,8 @@ def parse_args():
default=5,
help="If the word count is less then min_count, it will be removed from dict"
)
parser.add_argument(
'--min_n',
type=int,
default=3,
help="min_n of ngrams"
)
parser.add_argument(
'--max_n',
type=int,
default=5,
help="max_n of ngrams"
)
parser.add_argument('--min_n', type=int, default=3, help="min_n of ngrams")
parser.add_argument('--max_n', type=int, default=5, help="max_n of ngrams")
parser.add_argument(
'--file_nums',
type=int,
......@@ -201,6 +191,7 @@ def computeSubwords(word, min_n, max_n):
ngrams.add("".join(word[i:end]))
return list(ngrams)
def build_dict(args):
"""
proprocess the data, generate dictionary and save into dict_path.
......@@ -267,6 +258,7 @@ def build_dict(args):
f.write(" ".join(word_ngrams[key]))
f.write(u'\n')
def data_split(args):
raw_data_dir = args.input_corpus_dir
new_data_dir = args.output_corpus_dir
......
......@@ -40,8 +40,10 @@ class NumpyRandomInt(object):
class TrainReader(Reader):
def init(self):
dict_path = envs.get_global_env("dataset.dataset_train.word_count_dict_path")
word_ngrams_path = envs.get_global_env("dataset.dataset_train.word_ngrams_path")
dict_path = envs.get_global_env(
"dataset.dataset_train.word_count_dict_path")
word_ngrams_path = envs.get_global_env(
"dataset.dataset_train.word_ngrams_path")
self.window_size = envs.get_global_env("hyper_parameters.window_size")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
self.with_shuffle_batch = envs.get_global_env(
......@@ -53,7 +55,7 @@ class TrainReader(Reader):
for line in f:
line = line.rstrip().split()
self.word_ngrams[str(line[0])] = map(int, line[1:])
self.cs = None
if not self.with_shuffle_batch:
id_counts = []
......
......@@ -23,7 +23,8 @@ from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
self.batch_size = envs.get_global_env("dataset.dataset_infer.batch_size")
self.batch_size = envs.get_global_env(
"dataset.dataset_infer.batch_size")
self.input = []
self.length = None
......
......@@ -27,19 +27,27 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def _init_hyper_parameters(self):
self.learning_rate = envs.get_global_env("hyper_parameters.optimizer.learning_rate")
self.decay_steps = envs.get_global_env("hyper_parameters.optimizer.decay_steps")
self.decay_rate = envs.get_global_env("hyper_parameters.optimizer.decay_rate")
self.l2 = envs.get_global_env("hyper_parameters.optimizer.l2")
self.dict_size = envs.get_global_env("hyper_parameters.sparse_feature_nums")
self.learning_rate = envs.get_global_env(
"hyper_parameters.optimizer.learning_rate")
self.decay_steps = envs.get_global_env(
"hyper_parameters.optimizer.decay_steps")
self.decay_rate = envs.get_global_env(
"hyper_parameters.optimizer.decay_rate")
self.l2 = envs.get_global_env("hyper_parameters.optimizer.l2")
self.dict_size = envs.get_global_env(
"hyper_parameters.sparse_feature_nums")
self.corpus_size = envs.get_global_env("hyper_parameters.corpus_size")
self.train_batch_size = envs.get_global_env("dataset.dataset_train.batch_size")
self.evaluate_batch_size = envs.get_global_env("dataset.dataset_infer.batch_size")
self.train_batch_size = envs.get_global_env(
"dataset.dataset_train.batch_size")
self.evaluate_batch_size = envs.get_global_env(
"dataset.dataset_infer.batch_size")
self.hidden_size = envs.get_global_env("hyper_parameters.sparse_feature_dim")
self.step = envs.get_global_env("hyper_parameters.gnn_propogation_steps")
self.hidden_size = envs.get_global_env(
"hyper_parameters.sparse_feature_dim")
self.step = envs.get_global_env(
"hyper_parameters.gnn_propogation_steps")
def input_data(self, is_infer=False, **kwargs):
if is_infer:
......@@ -66,9 +74,7 @@ class Model(ModelBase):
label = fluid.data(
name="label", shape=[bs, 1], dtype="int64") # [batch_size, 1]
res = [
items, seq_index, last_index, adj_in, adj_out, mask, label
]
res = [items, seq_index, last_index, adj_in, adj_out, mask, label]
return res
def net(self, inputs, is_infer=False):
......@@ -76,7 +82,7 @@ class Model(ModelBase):
bs = self.evaluate_batch_size
else:
bs = self.train_batch_size
stdv = 1.0 / math.sqrt(self.hidden_size)
def embedding_layer(input,
......@@ -124,7 +130,8 @@ class Model(ModelBase):
state_adj_in = layers.matmul(inputs[3],
state_in) # [batch_size, uniq_max, h]
state_adj_out = layers.matmul(inputs[4], state_out) # [batch_size, uniq_max, h]
state_adj_out = layers.matmul(
inputs[4], state_out) # [batch_size, uniq_max, h]
gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)
......@@ -140,7 +147,8 @@ class Model(ModelBase):
x=pre_state, shape=[-1, self.hidden_size]),
size=3 * self.hidden_size)
final_state = layers.reshape(pre_state, shape=[bs, -1, self.hidden_size])
final_state = layers.reshape(
pre_state, shape=[bs, -1, self.hidden_size])
seq = layers.gather_nd(final_state, inputs[1])
last = layers.gather_nd(final_state, inputs[2])
......@@ -231,14 +239,13 @@ class Model(ModelBase):
self._cost = self.loss
if is_infer:
self._infer_results['acc'] = self.acc
self._infer_results['acc'] = self.acc
self._infer_results['loss'] = self.loss
return
self._metrics["LOSS"] = self.loss
self._metrics["LOSS"] = self.loss
self._metrics["train_acc"] = self.acc
def optimizer(self):
step_per_epoch = self.corpus_size // self.train_batch_size
optimizer = fluid.optimizer.Adam(
......@@ -249,4 +256,3 @@ class Model(ModelBase):
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=self.l2))
return optimizer
......@@ -23,7 +23,8 @@ from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
self.batch_size = envs.get_global_env("dataset.dataset_train.batch_size")
self.batch_size = envs.get_global_env(
"dataset.dataset_train.batch_size")
self.input = []
self.length = None
......
......@@ -11,51 +11,70 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
evaluate:
workspace: "paddlerec.models.recall.word2vec"
workspace: "paddlerec.models.recall.word2vec"
evaluate_only: False
evaluate_model_path: ""
reader:
batch_size: 50
class: "{workspace}/w2v_evaluate_reader.py"
test_data_path: "{workspace}/data/test"
word_id_dict_path: "{workspace}/data/dict/word_id_dict.txt"
# list of dataset
dataset:
- name: dataset_train # name of dataset to distinguish different datasets
batch_size: 100
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/train"
word_count_dict_path: "{workspace}/data/dict/word_count_dict.txt"
data_converter: "{workspace}/w2v_reader.py"
- name: dataset_infer # name
batch_size: 50
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/test"
word_id_dict_path: "{workspace}/data/dict/word_id_dict.txt"
data_converter: "{workspace}/w2v_evaluate_reader.py"
train:
trainer:
# for cluster training
strategy: "async"
hyper_parameters:
optimizer:
learning_rate: 1.0
decay_steps: 100000
decay_rate: 0.999
class: sgd
strategy: async
sparse_feature_number: 85
sparse_feature_dim: 300
with_shuffle_batch: False
neg_num: 5
window_size: 5
# select runner by name
mode: runner1
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner:
- name: runner1
class: single_train
# num of epochs
epochs: 2
workspace: "paddlerec.models.recall.word2vec"
# device to run training or infer
device: cpu
save_checkpoint_interval: 1 # save model interval of epochs
save_inference_interval: 1 # save inference
save_checkpoint_path: "increment" # save checkpoint path
save_inference_path: "inference" # save inference path
save_inference_feed_varnames: [] # feed vars of save inference
save_inference_fetch_varnames: [] # fetch vars of save inference
init_model_path: "" # load model path
fetch_period: 10
- name: runner2
class: single_infer
# num of epochs
epochs: 1
# device to run training or infer
device: cpu
init_model_path: "increment/0" # load model path
reader:
batch_size: 100
class: "{workspace}/w2v_reader.py"
train_data_path: "{workspace}/data/train"
word_count_dict_path: "{workspace}/data/dict/word_count_dict.txt"
model:
models: "{workspace}/model.py"
hyper_parameters:
sparse_feature_number: 85
sparse_feature_dim: 300
with_shuffle_batch: False
neg_num: 5
window_size: 5
learning_rate: 1.0
decay_steps: 100000
decay_rate: 0.999
optimizer: sgd
save:
increment:
dirname: "increment"
epoch_interval: 1
save_last: True
inference:
dirname: "inference"
epoch_interval: 1
save_last: True
# runner will run all the phase in each epoch
phase:
- name: phase1
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_train # select dataset by name
thread_num: 1
#- name: phase2
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_infer # select dataset by name
# thread_num: 1
......@@ -23,56 +23,60 @@ class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
def input(self):
neg_num = int(
envs.get_global_env("hyper_parameters.neg_num", None,
self._namespace))
self.input_word = fluid.data(
def _init_hyper_parameters(self):
self.is_distributed = True if envs.get_trainer(
) == "CtrTrainer" else False
self.sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number")
self.sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
self.with_shuffle_batch = envs.get_global_env(
"hyper_parameters.with_shuffle_batch")
self.learning_rate = envs.get_global_env(
"hyper_parameters.optimizer.learning_rate")
self.decay_steps = envs.get_global_env(
"hyper_parameters.optimizer.decay_steps")
self.decay_rate = envs.get_global_env(
"hyper_parameters.optimizer.decay_rate")
def input_data(self, is_infer=False, **kwargs):
if is_infer:
analogy_a = fluid.data(
name="analogy_a", shape=[None], dtype='int64')
analogy_b = fluid.data(
name="analogy_b", shape=[None], dtype='int64')
analogy_c = fluid.data(
name="analogy_c", shape=[None], dtype='int64')
analogy_d = fluid.data(
name="analogy_d", shape=[None], dtype='int64')
return [analogy_a, analogy_b, analogy_c, analogy_d]
input_word = fluid.data(
name="input_word", shape=[None, 1], dtype='int64')
self.true_word = fluid.data(
true_word = fluid.data(
name='true_label', shape=[None, 1], dtype='int64')
self._data_var.append(self.input_word)
self._data_var.append(self.true_word)
with_shuffle_batch = bool(
int(
envs.get_global_env("hyper_parameters.with_shuffle_batch",
None, self._namespace)))
if not with_shuffle_batch:
self.neg_word = fluid.data(
name="neg_label", shape=[None, neg_num], dtype='int64')
self._data_var.append(self.neg_word)
if self.with_shuffle_batch:
return [input_word, true_word]
if self._platform != "LINUX":
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
neg_word = fluid.data(
name="neg_label", shape=[None, self.neg_num], dtype='int64')
return [input_word, true_word, neg_word]
def net(self):
is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
neg_num = int(
envs.get_global_env("hyper_parameters.neg_num", None,
self._namespace))
sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
with_shuffle_batch = bool(
int(
envs.get_global_env("hyper_parameters.with_shuffle_batch",
None, self._namespace)))
def net(self, inputs, is_infer=False):
if is_infer:
self.infer_net(inputs)
return
def embedding_layer(input,
table_name,
emb_dim,
initializer_instance=None,
squeeze=False):
emb = fluid.embedding(
input=input,
is_sparse=True,
is_distributed=is_distributed,
size=[sparse_feature_number, emb_dim],
is_distributed=self.is_distributed,
size=[self.sparse_feature_number, self.sparse_feature_dim],
param_attr=fluid.ParamAttr(
name=table_name, initializer=initializer_instance), )
if squeeze:
......@@ -80,115 +84,60 @@ class Model(ModelBase):
else:
return emb
init_width = 0.5 / sparse_feature_dim
init_width = 1.0 / self.sparse_feature_dim
emb_initializer = fluid.initializer.Uniform(-init_width, init_width)
emb_w_initializer = fluid.initializer.Constant(value=0.0)
input_emb = embedding_layer(self.input_word, "emb", sparse_feature_dim,
emb_initializer, True)
true_emb_w = embedding_layer(self.true_word, "emb_w",
sparse_feature_dim, emb_w_initializer,
input_emb = embedding_layer(inputs[0], "emb", emb_initializer, True)
true_emb_w = embedding_layer(inputs[1], "emb_w", emb_w_initializer,
True)
true_emb_b = embedding_layer(self.true_word, "emb_b", 1,
emb_w_initializer, True)
if with_shuffle_batch:
if self.with_shuffle_batch:
neg_emb_w_list = []
for i in range(neg_num):
for i in range(self.neg_num):
neg_emb_w_list.append(
fluid.contrib.layers.shuffle_batch(
true_emb_w)) # shuffle true_word
neg_emb_w_concat = fluid.layers.concat(neg_emb_w_list, axis=0)
neg_emb_w = fluid.layers.reshape(
neg_emb_w_concat, shape=[-1, neg_num, sparse_feature_dim])
neg_emb_b_list = []
for i in range(neg_num):
neg_emb_b_list.append(
fluid.contrib.layers.shuffle_batch(
true_emb_b)) # shuffle true_word
neg_emb_b = fluid.layers.concat(neg_emb_b_list, axis=0)
neg_emb_b_vec = fluid.layers.reshape(
neg_emb_b, shape=[-1, neg_num])
neg_emb_w_concat,
shape=[-1, self.neg_num, self.sparse_feature_dim])
else:
neg_emb_w = embedding_layer(self.neg_word, "emb_w",
sparse_feature_dim, emb_w_initializer)
neg_emb_b = embedding_layer(self.neg_word, "emb_b", 1,
emb_w_initializer)
neg_emb_b_vec = fluid.layers.reshape(
neg_emb_b, shape=[-1, neg_num])
true_logits = fluid.layers.elementwise_add(
fluid.layers.reduce_sum(
fluid.layers.elementwise_mul(input_emb, true_emb_w),
dim=1,
keep_dim=True),
true_emb_b)
neg_emb_w = embedding_layer(inputs[2], "emb_w", emb_w_initializer)
true_logits = fluid.layers.reduce_sum(
fluid.layers.elementwise_mul(input_emb, true_emb_w),
dim=1,
keep_dim=True)
input_emb_re = fluid.layers.reshape(
input_emb, shape=[-1, 1, sparse_feature_dim])
input_emb, shape=[-1, 1, self.sparse_feature_dim])
neg_matmul = fluid.layers.matmul(
input_emb_re, neg_emb_w, transpose_y=True)
neg_logits = fluid.layers.elementwise_add(
fluid.layers.reshape(
neg_matmul, shape=[-1, neg_num]),
neg_emb_b_vec)
label_ones = fluid.layers.fill_constant_batch_size_like(
true_logits, shape=[-1, 1], value=1.0, dtype='float32')
label_zeros = fluid.layers.fill_constant_batch_size_like(
true_logits, shape=[-1, neg_num], value=0.0, dtype='float32')
true_xent = fluid.layers.sigmoid_cross_entropy_with_logits(true_logits,
label_ones)
neg_xent = fluid.layers.sigmoid_cross_entropy_with_logits(neg_logits,
label_zeros)
cost = fluid.layers.elementwise_add(
fluid.layers.reduce_sum(
true_xent, dim=1),
fluid.layers.reduce_sum(
neg_xent, dim=1))
self.avg_cost = fluid.layers.reduce_mean(cost)
global_right_cnt = fluid.layers.create_global_var(
name="global_right_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_total_cnt = fluid.layers.create_global_var(
name="global_total_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_right_cnt.stop_gradient = True
global_total_cnt.stop_gradient = True
neg_logits = fluid.layers.reshape(neg_matmul, shape=[-1, 1])
def avg_loss(self):
self._cost = self.avg_cost
logits = fluid.layers.concat([true_logits, neg_logits], axis=0)
label_ones = fluid.layers.fill_constant(
shape=[fluid.layers.shape(true_logits)[0], 1],
value=1.0,
dtype='float32')
label_zeros = fluid.layers.fill_constant(
shape=[fluid.layers.shape(neg_logits)[0], 1],
value=0.0,
dtype='float32')
label = fluid.layers.concat([label_ones, label_zeros], axis=0)
def metrics(self):
self._metrics["LOSS"] = self.avg_cost
loss = fluid.layers.log_loss(fluid.layers.sigmoid(logits), label)
avg_cost = fluid.layers.reduce_sum(loss)
def train_net(self):
self.input()
self.net()
self.avg_loss()
self.metrics()
self._cost = avg_cost
self._metrics["LOSS"] = avg_cost
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None,
self._namespace)
decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None,
self._namespace)
optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=learning_rate,
decay_steps=decay_steps,
decay_rate=decay_rate,
learning_rate=self.learning_rate,
decay_steps=self.decay_steps,
decay_rate=self.decay_rate,
staircase=True))
return optimizer
......@@ -213,28 +162,22 @@ class Model(ModelBase):
use_double_buffer=False,
iterable=False)
def infer_net(self):
sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None, self._namespace)
def infer_net(self, inputs):
def embedding_layer(input, table_name, initializer_instance=None):
emb = fluid.embedding(
input=input,
size=[sparse_feature_number, sparse_feature_dim],
size=[self.sparse_feature_number, self.sparse_feature_dim],
param_attr=table_name)
return emb
self.analogy_input()
all_label = np.arange(sparse_feature_number).reshape(
sparse_feature_number).astype('int32')
all_label = np.arange(self.sparse_feature_number).reshape(
self.sparse_feature_number).astype('int32')
self.all_label = fluid.layers.cast(
x=fluid.layers.assign(all_label), dtype='int64')
emb_all_label = embedding_layer(self.all_label, "emb")
emb_a = embedding_layer(self.analogy_a, "emb")
emb_b = embedding_layer(self.analogy_b, "emb")
emb_c = embedding_layer(self.analogy_c, "emb")
emb_a = embedding_layer(inputs[0], "emb")
emb_b = embedding_layer(inputs[1], "emb")
emb_c = embedding_layer(inputs[2], "emb")
target = fluid.layers.elementwise_add(
fluid.layers.elementwise_sub(emb_b, emb_a), emb_c)
......@@ -245,34 +188,34 @@ class Model(ModelBase):
values, pred_idx = fluid.layers.topk(input=dist, k=4)
label = fluid.layers.expand(
fluid.layers.unsqueeze(
self.analogy_d, axes=[1]),
expand_times=[1, 4])
inputs[3], axes=[1]), expand_times=[1, 4])
label_ones = fluid.layers.fill_constant_batch_size_like(
label, shape=[-1, 1], value=1.0, dtype='float32')
right_cnt = fluid.layers.reduce_sum(input=fluid.layers.cast(
fluid.layers.equal(pred_idx, label), dtype='float32'))
total_cnt = fluid.layers.reduce_sum(label_ones)
global_right_cnt = fluid.layers.create_global_var(
name="global_right_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_total_cnt = fluid.layers.create_global_var(
name="global_total_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_right_cnt.stop_gradient = True
global_total_cnt.stop_gradient = True
tmp1 = fluid.layers.elementwise_add(right_cnt, global_right_cnt)
fluid.layers.assign(tmp1, global_right_cnt)
tmp2 = fluid.layers.elementwise_add(total_cnt, global_total_cnt)
fluid.layers.assign(tmp2, global_total_cnt)
acc = fluid.layers.elementwise_div(
global_right_cnt, global_total_cnt, name="total_acc")
# global_right_cnt = fluid.layers.create_global_var(
# name="global_right_cnt",
# persistable=True,
# dtype='float32',
# shape=[1],
# value=0)
# global_total_cnt = fluid.layers.create_global_var(
# name="global_total_cnt",
# persistable=True,
# dtype='float32',
# shape=[1],
# value=0)
# global_right_cnt.stop_gradient = True
# global_total_cnt.stop_gradient = True
# tmp1 = fluid.layers.elementwise_add(right_cnt, global_right_cnt)
# fluid.layers.assign(tmp1, global_right_cnt)
# tmp2 = fluid.layers.elementwise_add(total_cnt, global_total_cnt)
# fluid.layers.assign(tmp2, global_total_cnt)
# acc = fluid.layers.elementwise_div(
# global_right_cnt, global_total_cnt, name="total_acc")
acc = fluid.layers.elementwise_div(right_cnt, total_cnt, name="acc")
self._infer_results['acc'] = acc
......@@ -20,10 +20,10 @@ from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class EvaluateReader(Reader):
class TrainReader(Reader):
def init(self):
dict_path = envs.get_global_env("word_id_dict_path", None,
"evaluate.reader")
dict_path = envs.get_global_env(
"dataset.dataset_infer.word_id_dict_path")
self.word_to_id = dict()
self.id_to_word = dict()
with io.open(dict_path, 'r', encoding='utf-8') as f:
......
......@@ -40,14 +40,12 @@ class NumpyRandomInt(object):
class TrainReader(Reader):
def init(self):
dict_path = envs.get_global_env("word_count_dict_path", None,
"train.reader")
self.window_size = envs.get_global_env("hyper_parameters.window_size",
None, "train.model")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num", None,
"train.model")
dict_path = envs.get_global_env(
"dataset.dataset_train.word_count_dict_path")
self.window_size = envs.get_global_env("hyper_parameters.window_size")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
self.with_shuffle_batch = envs.get_global_env(
"hyper_parameters.with_shuffle_batch", None, "train.model")
"hyper_parameters.with_shuffle_batch")
self.random_generator = NumpyRandomInt(1, self.window_size + 1)
self.cs = None
......