Commit 40c2beda authored by malin10

fix_code_style

Parent edec99eb
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 workspace: "paddlerec.models.match.dssm"
+
 dataset:
...
@@ -27,11 +27,15 @@ class Model(ModelBase):
         self.Neg = envs.get_global_env("hyper_parameters.NEG")
         self.hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes")
         self.hidden_acts = envs.get_global_env("hyper_parameters.fc_acts")
-        self.learning_rate = envs.get_global_env("hyper_parameters.learning_rate")
+        self.learning_rate = envs.get_global_env(
+            "hyper_parameters.learning_rate")

     def input_data(self, is_infer=False, **kwargs):
         query = fluid.data(
-            name="query", shape=[-1, self.TRIGRAM_D], dtype='float32', lod_level=0)
+            name="query",
+            shape=[-1, self.TRIGRAM_D],
+            dtype='float32',
+            lod_level=0)
         doc_pos = fluid.data(
             name="doc_pos",
             shape=[-1, self.TRIGRAM_D],
@@ -78,14 +82,14 @@ class Model(ModelBase):
             return
         R_Q_D_ns = []
-        for i in range(len(inputs)-2):
-            doc_neg_fc_i = fc(inputs[i+2], self.hidden_layers, self.hidden_acts, [
-                'doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i),
-                'doc_neg_l3_' + str(i)
-            ])
+        for i in range(len(inputs) - 2):
+            doc_neg_fc_i = fc(
+                inputs[i + 2], self.hidden_layers, self.hidden_acts, [
+                    'doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i),
+                    'doc_neg_l3_' + str(i)
+                ])
             R_Q_D_ns.append(fluid.layers.cos_sim(query_fc, doc_neg_fc_i))
-        concat_Rs = fluid.layers.concat(
-            input=[R_Q_D_p] + R_Q_D_ns, axis=-1)
+        concat_Rs = fluid.layers.concat(input=[R_Q_D_p] + R_Q_D_ns, axis=-1)
         prob = fluid.layers.softmax(concat_Rs, axis=1)
         hit_prob = fluid.layers.slice(
@@ -94,4 +98,3 @@ class Model(ModelBase):
         avg_cost = fluid.layers.mean(x=loss)
         self._cost = avg_cost
         self._metrics["LOSS"] = avg_cost
-
# DSSM
## Introduction
DSSM ([Learning Deep Structured Semantic Models for Web Search using Clickthrough Data](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cikm2013_DSSM_fullversion.pdf)) is a deep-network semantic model. Its core idea is to map queries and documents into a semantic space of a common dimension and to maximize the cosine similarity between the query and document semantic vectors, so that a latent semantic model is learned and can be used for retrieval; word hashing is applied to reduce the dimensionality of the input vectors. DSSM is widely used, for example in search engine retrieval, ad relevance, question answering, and machine translation.
This project implements the DSSM model on PaddlePaddle following the network structure in the paper, and constructs a dataset to verify that the network is correct. A minimal sketch of the scoring pipeline follows.
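The sketch below is a rough, self-contained illustration of that pipeline, not this repo's code: `fc_tower` and the fixed layer sizes are assumptions taken from the hyperparameters listed in the next section. A shared query tower and per-document towers produce semantic vectors, their cosine similarities are softmax-normalized over one positive and NEG negatives, and the loss maximizes the probability assigned to the clicked document.
```python
# Minimal DSSM-style scoring sketch (Paddle 1.x fluid API, as used in this repo).
# `fc_tower` and the constants are illustrative assumptions, not project code.
import paddle.fluid as fluid

TRIGRAM_D = 1000  # dimension of the word-hashed input vector
NEG = 4           # number of sampled negative documents


def fc_tower(x, prefix):
    # Three fully connected layers, mirroring fc_sizes / fc_acts below.
    for i, (size, act) in enumerate(zip([300, 300, 128], ['tanh'] * 3)):
        x = fluid.layers.fc(input=x, size=size, act=act,
                            name='%s_l%d' % (prefix, i + 1))
    return x


query = fluid.data(name='query', shape=[-1, TRIGRAM_D], dtype='float32')
doc_pos = fluid.data(name='doc_pos', shape=[-1, TRIGRAM_D], dtype='float32')
doc_negs = [
    fluid.data(name='doc_neg_%d' % i, shape=[-1, TRIGRAM_D], dtype='float32')
    for i in range(NEG)
]

query_fc = fc_tower(query, 'query')
R_Q_D_p = fluid.layers.cos_sim(query_fc, fc_tower(doc_pos, 'doc_pos'))
R_Q_D_ns = [
    fluid.layers.cos_sim(query_fc, fc_tower(d, 'doc_neg_%d' % i))
    for i, d in enumerate(doc_negs)
]

# Softmax over the similarities of the positive and the NEG negatives;
# the loss maximizes the probability assigned to the clicked document.
concat_Rs = fluid.layers.concat([R_Q_D_p] + R_Q_D_ns, axis=-1)
prob = fluid.layers.softmax(concat_Rs, axis=1)
hit_prob = fluid.layers.slice(prob, axes=[1], starts=[0], ends=[1])
avg_cost = fluid.layers.mean(x=-fluid.layers.log(hit_prob))
```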
## Hyperparameters
```
optimizer:
  class: sgd # optimizer
  learning_rate: 0.01 # learning rate
  strategy: async # parameter update strategy
TRIGRAM_D: 1000 # length of the query and doc semantic vectors
NEG: 4 # number of negative samples
fc_sizes: [300, 300, 128] # fc layer sizes
fc_acts: ['tanh', 'tanh', 'tanh'] # fc layer activation functions
```
## Quick start
PaddleRec ships with a small demo dataset so you can try the model quickly. The training command is:
```bash
python -m paddlerec.run -m paddlerec.models.match.dssm
```
Before running inference, update the configuration in config.yaml as follows:
```
workspace: "~/code/paddlerec/models/match/dssm" # set to the absolute path of this config.yaml
#mode: runner1 # train
mode: runner2 # infer
runner:
- name: runner2
  class: single_infer
  init_model_path: "increment/2" # set to the model path to run inference on
phase:
- name: phase1
  model: "{workspace}/model.py"
  dataset_name: dataset_infer # switch to the inference dataset
  thread_num: 1 # number of dataset threads
```
After the changes, run the inference command:
```
python -m paddlerec.run -m ./config.yaml
```
## Testing notes
The dataset currently used by the DSSM model is randomly constructed, so for acceptance testing it is enough to run through the steps above on the demo dataset.
@@ -16,7 +16,7 @@ from __future__ import print_function
 from paddlerec.core.reader import Reader

-class EvaluateReader(Reader):
+class TrainReader(Reader):
     def init(self):
         pass
...
@@ -101,12 +101,17 @@ class Model(ModelBase):
         ModelBase.__init__(self, config)

     def _init_hyper_parameters(self):
-        self.query_encoder = envs.get_global_env("hyper_parameters.query_encoder")
-        self.title_encoder = envs.get_global_env("hyper_parameters.title_encoder")
-        self.query_encode_dim = envs.get_global_env("hyper_parameters.query_encode_dim")
-        self.title_encode_dim = envs.get_global_env("hyper_parameters.title_encode_dim")
-
-        self.emb_size = envs.get_global_env("hyper_parameters.sparse_feature_dim")
+        self.query_encoder = envs.get_global_env(
+            "hyper_parameters.query_encoder")
+        self.title_encoder = envs.get_global_env(
+            "hyper_parameters.title_encoder")
+        self.query_encode_dim = envs.get_global_env(
+            "hyper_parameters.query_encode_dim")
+        self.title_encode_dim = envs.get_global_env(
+            "hyper_parameters.title_encode_dim")
+
+        self.emb_size = envs.get_global_env(
+            "hyper_parameters.sparse_feature_dim")
         self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim")
         self.emb_shape = [self.emb_size, self.emb_dim]
@@ -137,7 +142,6 @@ class Model(ModelBase):
             param_attr='q_fc.w',
             bias_attr='q_fc.b')
         self.pt_slots = self._sparse_data_var[1:2]
-
         self.title_encoders = [
             factory.create(self.title_encoder, self.title_encode_dim)
...
# Multi-view Simnet for Personalized recommendation
## Introduction
In personalized recommendation scenarios, the list of items a recommender system shows to a user is usually produced by a personalized matching model. In the real world, a user has features from many views, such as the user ID, age, and item click history; an item, for example a news article, likewise has multi-view features such as its title and category. The Multi-view Simnet model fuses the multi-view features of both the user and the candidate item into a single model that learns personalized matching. Models of this kind are used in many industrial settings, for example in Baidu's Feed products.
The goal of this project is to provide a Paddle implementation of a model for this personalized matching setting. Multi-view Simnet consists of several encoder modules, one per feature view. The project currently provides Bag-of-Embedding, Temporal-Convolutional, and Gated-Recurrent-Unit encoders; encoders that are useful in sparse-feature scenarios will be added over time. Training currently follows a pairwise ranking scheme: for each correlated user-item pair, a random item is drawn as a negative example and the model learns to rank the positive item above it, as the sketch below illustrates.
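A minimal sketch of this pairwise hinge loss, assuming the similarity scores are already computed; the `pos_sim`/`neg_sim` placeholders are illustrative, not names from this repo:
```python
# Pairwise max-margin (hinge) ranking loss sketch, Paddle 1.x fluid API.
import paddle.fluid as fluid

margin = 0.1  # max margin for hinge-loss, see `margin` in the config below

# Placeholder similarity scores of a user against a positive and a negative
# item; in the real model these come from cos_sim over the encoder outputs.
pos_sim = fluid.data(name='pos_sim', shape=[-1, 1], dtype='float32')
neg_sim = fluid.data(name='neg_sim', shape=[-1, 1], dtype='float32')

# loss = mean(max(0, margin - pos_sim + neg_sim)): a sample is penalized
# whenever the negative item scores within `margin` of the positive one.
margin_t = fluid.layers.fill_constant_batch_size_like(
    input=pos_sim, shape=[-1, 1], value=margin, dtype='float32')
zero_t = fluid.layers.fill_constant_batch_size_like(
    input=pos_sim, shape=[-1, 1], value=0.0, dtype='float32')
hinge = fluid.layers.elementwise_max(
    zero_t,
    fluid.layers.elementwise_add(
        fluid.layers.elementwise_sub(margin_t, pos_sim), neg_sim))
avg_cost = fluid.layers.mean(hinge)
```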
## Hyperparameters
```
optimizer:
  class: Adam # optimizer type
  learning_rate: 0.0001 # learning rate
  strategy: async # parameter update strategy
query_encoder: "bow" # encoder for user features
title_encoder: "bow" # encoder for item features
query_encode_dim: 128 # output dimension of the user encoder
title_encode_dim: 128 # output dimension of the item encoder
sparse_feature_dim: 1000001 # total number of user and item features
embedding_dim: 128 # embedding dimension
hidden_size: 128 # hidden layer size
margin: 0.1 # max margin for hinge-loss
```
## Quick start
PaddleRec ships with a small demo dataset so you can try the model quickly. The training command is:
```bash
python -m paddlerec.run -m paddlerec.models.match.multiview-simnet
```
Before running inference, update the configuration in config.yaml as follows:
```
workspace: "~/code/paddlerec/models/match/multiview-simnet" # set to the absolute path of this config.yaml
#mode: runner1 # train
mode: runner2 # infer
runner:
- name: runner2
  class: single_infer
  init_model_path: "increment/2" # set to the model path to run inference on
phase:
- name: phase1
  model: "{workspace}/model.py"
  dataset_name: dataset_infer # switch to the inference dataset
  thread_num: 1 # number of dataset threads
```
After the changes, run the inference command:
```
python -m paddlerec.run -m ./config.yaml
```
## Testing notes
The dataset currently used by the Multi-view Simnet model is randomly generated, so for acceptance testing it is enough to run through the steps above on the demo dataset.
@@ -22,7 +22,8 @@ from paddlerec.core.utils import envs

 class TrainReader(Reader):
     def init(self):
-        dict_path = envs.get_global_env("dataset.dataset_infer.word_id_dict_path")
+        dict_path = envs.get_global_env(
+            "dataset.dataset_infer.word_id_dict_path")
         self.min_n = envs.get_global_env("hyper_parameters.min_n")
         self.max_n = envs.get_global_env("hyper_parameters.max_n")
         self.word_to_id = dict()
@@ -78,7 +79,8 @@ class TrainReader(Reader):
        a unicode string - a space-delimited sequence of words.
        """
        return u" ".join([
-            "<" + word + ">" if "<" + word + ">" in original_vocab else u"<UNK>"
+            "<" + word + ">"
+            if "<" + word + ">" in original_vocab else u"<UNK>"
            for word in line.split()
        ])
@@ -99,9 +101,7 @@ class TrainReader(Reader):
                res.append(self.word_to_id[_])
            inputs.append(res)
            print(inputs)
-            yield [('analogy_a', inputs[0]),
-                   ('analogy_b', inputs[1]),
-                   ('analogy_c', inputs[2]),
-                   ('analogy_d', inputs[3][0:1])]
+            yield [('analogy_a', inputs[0]), ('analogy_b', inputs[1]),
+                   ('analogy_c', inputs[2]), ('analogy_d', inputs[3][0:1])]

        return reader
@@ -24,15 +24,21 @@ class Model(ModelBase):
         ModelBase.__init__(self, config)

     def _init_hyper_parameters(self):
-        self.is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
-        self.sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number")
-        self.sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim")
+        self.is_distributed = True if envs.get_trainer(
+        ) == "CtrTrainer" else False
+        self.sparse_feature_number = envs.get_global_env(
+            "hyper_parameters.sparse_feature_number")
+        self.sparse_feature_dim = envs.get_global_env(
+            "hyper_parameters.sparse_feature_dim")
         self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
-        self.with_shuffle_batch = envs.get_global_env("hyper_parameters.with_shuffle_batch")
-        self.learning_rate = envs.get_global_env("hyper_parameters.optimizer.learning_rate")
-        self.decay_steps = envs.get_global_env("hyper_parameters.optimizer.decay_steps")
-        self.decay_rate = envs.get_global_env("hyper_parameters.optimizer.decay_rate")
+        self.with_shuffle_batch = envs.get_global_env(
+            "hyper_parameters.with_shuffle_batch")
+        self.learning_rate = envs.get_global_env(
+            "hyper_parameters.optimizer.learning_rate")
+        self.decay_steps = envs.get_global_env(
+            "hyper_parameters.optimizer.decay_steps")
+        self.decay_rate = envs.get_global_env(
+            "hyper_parameters.optimizer.decay_rate")

     def input_data(self, is_infer=False, **kwargs):
         if is_infer:
@@ -74,7 +80,8 @@ class Model(ModelBase):
                 param_attr=fluid.ParamAttr(
                     name=table_name, initializer=initializer_instance), )
             if sequence_pool:
-                emb = fluid.layers.sequence_pool(input=emb, pool_type='average')
+                emb = fluid.layers.sequence_pool(
+                    input=emb, pool_type='average')
             return emb

         init_width = 1.0 / self.sparse_feature_dim
@@ -83,10 +90,10 @@ class Model(ModelBase):
         input_emb = embedding_layer(inputs[0], "emb", emb_initializer, True)
         input_emb = fluid.layers.squeeze(input=input_emb, axes=[1])
-        true_emb_w = embedding_layer(inputs[1], "emb_w", emb_w_initializer, True)
+        true_emb_w = embedding_layer(inputs[1], "emb_w", emb_w_initializer,
+                                     True)
         true_emb_w = fluid.layers.squeeze(input=true_emb_w, axes=[1])

         if self.with_shuffle_batch:
             neg_emb_w_list = []
             for i in range(self.neg_num):
@@ -95,7 +102,8 @@ class Model(ModelBase):
                         true_emb_w))  # shuffle true_word
             neg_emb_w_concat = fluid.layers.concat(neg_emb_w_list, axis=0)
             neg_emb_w = fluid.layers.reshape(
-                neg_emb_w_concat, shape=[-1, self.neg_num, self.sparse_feature_dim])
+                neg_emb_w_concat,
+                shape=[-1, self.neg_num, self.sparse_feature_dim])
         else:
             neg_emb_w = embedding_layer(inputs[2], "emb_w", emb_w_initializer)

         true_logits = fluid.layers.reduce_sum(
@@ -107,8 +115,7 @@ class Model(ModelBase):
             input_emb, shape=[-1, 1, self.sparse_feature_dim])
         neg_matmul = fluid.layers.matmul(
             input_emb_re, neg_emb_w, transpose_y=True)
-        neg_logits = fluid.layers.reshape(
-            neg_matmul, shape=[-1, 1])
+        neg_logits = fluid.layers.reshape(neg_matmul, shape=[-1, 1])

         logits = fluid.layers.concat([true_logits, neg_logits], axis=0)
         label_ones = fluid.layers.fill_constant(
@@ -126,7 +133,6 @@ class Model(ModelBase):
         self._cost = avg_cost
         self._metrics["LOSS"] = avg_cost
-
     def optimizer(self):
         optimizer = fluid.optimizer.SGD(
             learning_rate=fluid.layers.exponential_decay(
@@ -137,13 +143,17 @@ class Model(ModelBase):
         return optimizer

     def infer_net(self, inputs):
-        def embedding_layer(input, table_name, initializer_instance=None, sequence_pool=False):
+        def embedding_layer(input,
+                            table_name,
+                            initializer_instance=None,
+                            sequence_pool=False):
             emb = fluid.embedding(
                 input=input,
                 size=[self.sparse_feature_number, self.sparse_feature_dim],
                 param_attr=table_name)
             if sequence_pool:
-                emb = fluid.layers.sequence_pool(input=emb, pool_type='average')
+                emb = fluid.layers.sequence_pool(
+                    input=emb, pool_type='average')
             return emb

         all_label = np.arange(self.sparse_feature_number).reshape(
@@ -166,9 +176,7 @@ class Model(ModelBase):
         dist = fluid.layers.matmul(
             x=target, y=emb_all_label_l2, transpose_y=True)
         values, pred_idx = fluid.layers.topk(input=dist, k=4)
-        label = fluid.layers.expand(
-            inputs[3],
-            expand_times=[1, 4])
+        label = fluid.layers.expand(inputs[3], expand_times=[1, 4])
         label_ones = fluid.layers.fill_constant_batch_size_like(
             label, shape=[-1, 1], value=1.0, dtype='float32')
         right_cnt = fluid.layers.reduce_sum(input=fluid.layers.cast(
...
@@ -45,18 +45,8 @@ def parse_args():
         default=5,
         help="If the word count is less then min_count, it will be removed from dict"
     )
-    parser.add_argument(
-        '--min_n',
-        type=int,
-        default=3,
-        help="min_n of ngrams"
-    )
-    parser.add_argument(
-        '--max_n',
-        type=int,
-        default=5,
-        help="max_n of ngrams"
-    )
+    parser.add_argument('--min_n', type=int, default=3, help="min_n of ngrams")
+    parser.add_argument('--max_n', type=int, default=5, help="max_n of ngrams")
     parser.add_argument(
         '--file_nums',
         type=int,
@@ -201,6 +191,7 @@ def computeSubwords(word, min_n, max_n):
             ngrams.add("".join(word[i:end]))
     return list(ngrams)

+
 def build_dict(args):
     """
     proprocess the data, generate dictionary and save into dict_path.
@@ -267,6 +258,7 @@ def build_dict(args):
             f.write(" ".join(word_ngrams[key]))
             f.write(u'\n')

+
 def data_split(args):
     raw_data_dir = args.input_corpus_dir
     new_data_dir = args.output_corpus_dir
...
@@ -40,8 +40,10 @@ class NumpyRandomInt(object):

 class TrainReader(Reader):
     def init(self):
-        dict_path = envs.get_global_env("dataset.dataset_train.word_count_dict_path")
-        word_ngrams_path = envs.get_global_env("dataset.dataset_train.word_ngrams_path")
+        dict_path = envs.get_global_env(
+            "dataset.dataset_train.word_count_dict_path")
+        word_ngrams_path = envs.get_global_env(
+            "dataset.dataset_train.word_ngrams_path")
         self.window_size = envs.get_global_env("hyper_parameters.window_size")
         self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
         self.with_shuffle_batch = envs.get_global_env(
...
@@ -23,7 +23,8 @@ from paddlerec.core.utils import envs

 class TrainReader(Reader):
     def init(self):
-        self.batch_size = envs.get_global_env("dataset.dataset_infer.batch_size")
+        self.batch_size = envs.get_global_env(
+            "dataset.dataset_infer.batch_size")
         self.input = []
         self.length = None
...
@@ -27,19 +27,27 @@ class Model(ModelBase):
         ModelBase.__init__(self, config)

     def _init_hyper_parameters(self):
-        self.learning_rate = envs.get_global_env("hyper_parameters.optimizer.learning_rate")
-        self.decay_steps = envs.get_global_env("hyper_parameters.optimizer.decay_steps")
-        self.decay_rate = envs.get_global_env("hyper_parameters.optimizer.decay_rate")
+        self.learning_rate = envs.get_global_env(
+            "hyper_parameters.optimizer.learning_rate")
+        self.decay_steps = envs.get_global_env(
+            "hyper_parameters.optimizer.decay_steps")
+        self.decay_rate = envs.get_global_env(
+            "hyper_parameters.optimizer.decay_rate")
         self.l2 = envs.get_global_env("hyper_parameters.optimizer.l2")
-        self.dict_size = envs.get_global_env("hyper_parameters.sparse_feature_nums")
+        self.dict_size = envs.get_global_env(
+            "hyper_parameters.sparse_feature_nums")
         self.corpus_size = envs.get_global_env("hyper_parameters.corpus_size")
-        self.train_batch_size = envs.get_global_env("dataset.dataset_train.batch_size")
-        self.evaluate_batch_size = envs.get_global_env("dataset.dataset_infer.batch_size")
-        self.hidden_size = envs.get_global_env("hyper_parameters.sparse_feature_dim")
-        self.step = envs.get_global_env("hyper_parameters.gnn_propogation_steps")
+        self.train_batch_size = envs.get_global_env(
+            "dataset.dataset_train.batch_size")
+        self.evaluate_batch_size = envs.get_global_env(
+            "dataset.dataset_infer.batch_size")
+        self.hidden_size = envs.get_global_env(
+            "hyper_parameters.sparse_feature_dim")
+        self.step = envs.get_global_env(
+            "hyper_parameters.gnn_propogation_steps")

     def input_data(self, is_infer=False, **kwargs):
         if is_infer:
@@ -66,9 +74,7 @@ class Model(ModelBase):
         label = fluid.data(
             name="label", shape=[bs, 1], dtype="int64")  # [batch_size, 1]
-        res = [
-            items, seq_index, last_index, adj_in, adj_out, mask, label
-        ]
+        res = [items, seq_index, last_index, adj_in, adj_out, mask, label]
         return res

     def net(self, inputs, is_infer=False):
@@ -124,7 +130,8 @@ class Model(ModelBase):
         state_adj_in = layers.matmul(inputs[3],
                                      state_in)  # [batch_size, uniq_max, h]
-        state_adj_out = layers.matmul(inputs[4], state_out)  # [batch_size, uniq_max, h]
+        state_adj_out = layers.matmul(
+            inputs[4], state_out)  # [batch_size, uniq_max, h]

         gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)
@@ -140,7 +147,8 @@ class Model(ModelBase):
                 x=pre_state, shape=[-1, self.hidden_size]),
             size=3 * self.hidden_size)

-        final_state = layers.reshape(pre_state, shape=[bs, -1, self.hidden_size])
+        final_state = layers.reshape(
+            pre_state, shape=[bs, -1, self.hidden_size])
         seq = layers.gather_nd(final_state, inputs[1])
         last = layers.gather_nd(final_state, inputs[2])
@@ -238,7 +246,6 @@ class Model(ModelBase):
         self._metrics["LOSS"] = self.loss
         self._metrics["train_acc"] = self.acc
-
     def optimizer(self):
         step_per_epoch = self.corpus_size // self.train_batch_size
         optimizer = fluid.optimizer.Adam(
@@ -249,4 +256,3 @@ class Model(ModelBase):
             regularization=fluid.regularizer.L2DecayRegularizer(
                 regularization_coeff=self.l2))
         return optimizer
-
@@ -23,7 +23,8 @@ from paddlerec.core.utils import envs

 class TrainReader(Reader):
     def init(self):
-        self.batch_size = envs.get_global_env("dataset.dataset_train.batch_size")
+        self.batch_size = envs.get_global_env(
+            "dataset.dataset_train.batch_size")
         self.input = []
         self.length = None
...
@@ -11,51 +11,70 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

-evaluate:
-  workspace: "paddlerec.models.recall.word2vec"
-  evaluate_only: False
-  evaluate_model_path: ""
-
-  reader:
-    batch_size: 50
-    class: "{workspace}/w2v_evaluate_reader.py"
-    test_data_path: "{workspace}/data/test"
-    word_id_dict_path: "{workspace}/data/dict/word_id_dict.txt"
-
-train:
-  trainer:
-    # for cluster training
-    strategy: "async"
-
-  epochs: 2
-  workspace: "paddlerec.models.recall.word2vec"
-
-  reader:
-    batch_size: 100
-    class: "{workspace}/w2v_reader.py"
-    train_data_path: "{workspace}/data/train"
-    word_count_dict_path: "{workspace}/data/dict/word_count_dict.txt"
-
-  model:
-    models: "{workspace}/model.py"
-    hyper_parameters:
-      sparse_feature_number: 85
-      sparse_feature_dim: 300
-      with_shuffle_batch: False
-      neg_num: 5
-      window_size: 5
-      learning_rate: 1.0
-      decay_steps: 100000
-      decay_rate: 0.999
-      optimizer: sgd
-
-  save:
-    increment:
-      dirname: "increment"
-      epoch_interval: 1
-      save_last: True
-    inference:
-      dirname: "inference"
-      epoch_interval: 1
-      save_last: True
+workspace: "paddlerec.models.recall.word2vec"
+
+# list of dataset
+dataset:
+- name: dataset_train # name of dataset to distinguish different datasets
+  batch_size: 100
+  type: DataLoader # or QueueDataset
+  data_path: "{workspace}/data/train"
+  word_count_dict_path: "{workspace}/data/dict/word_count_dict.txt"
+  data_converter: "{workspace}/w2v_reader.py"
+- name: dataset_infer # name
+  batch_size: 50
+  type: DataLoader # or QueueDataset
+  data_path: "{workspace}/data/test"
+  word_id_dict_path: "{workspace}/data/dict/word_id_dict.txt"
+  data_converter: "{workspace}/w2v_evaluate_reader.py"
+
+hyper_parameters:
+  optimizer:
+    learning_rate: 1.0
+    decay_steps: 100000
+    decay_rate: 0.999
+    class: sgd
+    strategy: async
+  sparse_feature_number: 85
+  sparse_feature_dim: 300
+  with_shuffle_batch: False
+  neg_num: 5
+  window_size: 5
+
+# select runner by name
+mode: runner1
+# config of each runner.
+# runner is a kind of paddle training class, which wraps the train/infer process.
+runner:
+- name: runner1
+  class: single_train
+  # num of epochs
+  epochs: 2
+  # device to run training or infer
+  device: cpu
+  save_checkpoint_interval: 1 # save model interval of epochs
+  save_inference_interval: 1 # save inference
+  save_checkpoint_path: "increment" # save checkpoint path
+  save_inference_path: "inference" # save inference path
+  save_inference_feed_varnames: [] # feed vars of save inference
+  save_inference_fetch_varnames: [] # fetch vars of save inference
+  init_model_path: "" # load model path
+  fetch_period: 10
+- name: runner2
+  class: single_infer
+  # num of epochs
+  epochs: 1
+  # device to run training or infer
+  device: cpu
+  init_model_path: "increment/0" # load model path
+
+# runner will run all the phase in each epoch
+phase:
+- name: phase1
+  model: "{workspace}/model.py" # user-defined model
+  dataset_name: dataset_train # select dataset by name
+  thread_num: 1
+#- name: phase2
+#  model: "{workspace}/model.py" # user-defined model
+#  dataset_name: dataset_infer # select dataset by name
+#  thread_num: 1
@@ -23,56 +23,60 @@ class Model(ModelBase):
     def __init__(self, config):
         ModelBase.__init__(self, config)

-    def input(self):
-        neg_num = int(
-            envs.get_global_env("hyper_parameters.neg_num", None,
-                                self._namespace))
-        self.input_word = fluid.data(
+    def _init_hyper_parameters(self):
+        self.is_distributed = True if envs.get_trainer(
+        ) == "CtrTrainer" else False
+        self.sparse_feature_number = envs.get_global_env(
+            "hyper_parameters.sparse_feature_number")
+        self.sparse_feature_dim = envs.get_global_env(
+            "hyper_parameters.sparse_feature_dim")
+        self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
+        self.with_shuffle_batch = envs.get_global_env(
+            "hyper_parameters.with_shuffle_batch")
+        self.learning_rate = envs.get_global_env(
+            "hyper_parameters.optimizer.learning_rate")
+        self.decay_steps = envs.get_global_env(
+            "hyper_parameters.optimizer.decay_steps")
+        self.decay_rate = envs.get_global_env(
+            "hyper_parameters.optimizer.decay_rate")
+
+    def input_data(self, is_infer=False, **kwargs):
+        if is_infer:
+            analogy_a = fluid.data(
+                name="analogy_a", shape=[None], dtype='int64')
+            analogy_b = fluid.data(
+                name="analogy_b", shape=[None], dtype='int64')
+            analogy_c = fluid.data(
+                name="analogy_c", shape=[None], dtype='int64')
+            analogy_d = fluid.data(
+                name="analogy_d", shape=[None], dtype='int64')
+            return [analogy_a, analogy_b, analogy_c, analogy_d]
+
+        input_word = fluid.data(
             name="input_word", shape=[None, 1], dtype='int64')
-        self.true_word = fluid.data(
+        true_word = fluid.data(
             name='true_label', shape=[None, 1], dtype='int64')
-        self._data_var.append(self.input_word)
-        self._data_var.append(self.true_word)
-        with_shuffle_batch = bool(
-            int(
-                envs.get_global_env("hyper_parameters.with_shuffle_batch",
-                                    None, self._namespace)))
-        if not with_shuffle_batch:
-            self.neg_word = fluid.data(
-                name="neg_label", shape=[None, neg_num], dtype='int64')
-            self._data_var.append(self.neg_word)
+        if self.with_shuffle_batch:
+            return [input_word, true_word]

-        if self._platform != "LINUX":
-            self._data_loader = fluid.io.DataLoader.from_generator(
-                feed_list=self._data_var,
-                capacity=64,
-                use_double_buffer=False,
-                iterable=False)
+        neg_word = fluid.data(
+            name="neg_label", shape=[None, self.neg_num], dtype='int64')
+        return [input_word, true_word, neg_word]

-    def net(self):
-        is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
-        neg_num = int(
-            envs.get_global_env("hyper_parameters.neg_num", None,
-                                self._namespace))
-        sparse_feature_number = envs.get_global_env(
-            "hyper_parameters.sparse_feature_number", None, self._namespace)
-        sparse_feature_dim = envs.get_global_env(
-            "hyper_parameters.sparse_feature_dim", None, self._namespace)
-        with_shuffle_batch = bool(
-            int(
-                envs.get_global_env("hyper_parameters.with_shuffle_batch",
-                                    None, self._namespace)))
+    def net(self, inputs, is_infer=False):
+        if is_infer:
+            self.infer_net(inputs)
+            return

         def embedding_layer(input,
                             table_name,
-                            emb_dim,
                             initializer_instance=None,
                             squeeze=False):
             emb = fluid.embedding(
                 input=input,
                 is_sparse=True,
-                is_distributed=is_distributed,
-                size=[sparse_feature_number, emb_dim],
+                is_distributed=self.is_distributed,
+                size=[self.sparse_feature_number, self.sparse_feature_dim],
                 param_attr=fluid.ParamAttr(
                     name=table_name, initializer=initializer_instance), )
             if squeeze:
@@ -80,115 +84,60 @@ class Model(ModelBase):
             else:
                 return emb

-        init_width = 0.5 / sparse_feature_dim
+        init_width = 1.0 / self.sparse_feature_dim
         emb_initializer = fluid.initializer.Uniform(-init_width, init_width)
         emb_w_initializer = fluid.initializer.Constant(value=0.0)

-        input_emb = embedding_layer(self.input_word, "emb", sparse_feature_dim,
-                                    emb_initializer, True)
-        true_emb_w = embedding_layer(self.true_word, "emb_w",
-                                     sparse_feature_dim, emb_w_initializer,
-                                     True)
-        true_emb_b = embedding_layer(self.true_word, "emb_b", 1,
-                                     emb_w_initializer, True)
+        input_emb = embedding_layer(inputs[0], "emb", emb_initializer, True)
+        true_emb_w = embedding_layer(inputs[1], "emb_w", emb_w_initializer,
+                                     True)

-        if with_shuffle_batch:
+        if self.with_shuffle_batch:
             neg_emb_w_list = []
-            for i in range(neg_num):
+            for i in range(self.neg_num):
                 neg_emb_w_list.append(
                     fluid.contrib.layers.shuffle_batch(
                         true_emb_w))  # shuffle true_word
             neg_emb_w_concat = fluid.layers.concat(neg_emb_w_list, axis=0)
             neg_emb_w = fluid.layers.reshape(
-                neg_emb_w_concat, shape=[-1, neg_num, sparse_feature_dim])
-
-            neg_emb_b_list = []
-            for i in range(neg_num):
-                neg_emb_b_list.append(
-                    fluid.contrib.layers.shuffle_batch(
-                        true_emb_b))  # shuffle true_word
-            neg_emb_b = fluid.layers.concat(neg_emb_b_list, axis=0)
-            neg_emb_b_vec = fluid.layers.reshape(
-                neg_emb_b, shape=[-1, neg_num])
+                neg_emb_w_concat,
+                shape=[-1, self.neg_num, self.sparse_feature_dim])
         else:
-            neg_emb_w = embedding_layer(self.neg_word, "emb_w",
-                                        sparse_feature_dim, emb_w_initializer)
-            neg_emb_b = embedding_layer(self.neg_word, "emb_b", 1,
-                                        emb_w_initializer)
-            neg_emb_b_vec = fluid.layers.reshape(
-                neg_emb_b, shape=[-1, neg_num])
+            neg_emb_w = embedding_layer(inputs[2], "emb_w", emb_w_initializer)

-        true_logits = fluid.layers.elementwise_add(
-            fluid.layers.reduce_sum(
-                fluid.layers.elementwise_mul(input_emb, true_emb_w),
-                dim=1,
-                keep_dim=True),
-            true_emb_b)
+        true_logits = fluid.layers.reduce_sum(
+            fluid.layers.elementwise_mul(input_emb, true_emb_w),
+            dim=1,
+            keep_dim=True)

         input_emb_re = fluid.layers.reshape(
-            input_emb, shape=[-1, 1, sparse_feature_dim])
+            input_emb, shape=[-1, 1, self.sparse_feature_dim])
         neg_matmul = fluid.layers.matmul(
             input_emb_re, neg_emb_w, transpose_y=True)
-        neg_logits = fluid.layers.elementwise_add(
-            fluid.layers.reshape(
-                neg_matmul, shape=[-1, neg_num]),
-            neg_emb_b_vec)
-
-        label_ones = fluid.layers.fill_constant_batch_size_like(
-            true_logits, shape=[-1, 1], value=1.0, dtype='float32')
-        label_zeros = fluid.layers.fill_constant_batch_size_like(
-            true_logits, shape=[-1, neg_num], value=0.0, dtype='float32')
-
-        true_xent = fluid.layers.sigmoid_cross_entropy_with_logits(true_logits,
-                                                                   label_ones)
-        neg_xent = fluid.layers.sigmoid_cross_entropy_with_logits(neg_logits,
-                                                                  label_zeros)
-        cost = fluid.layers.elementwise_add(
-            fluid.layers.reduce_sum(
-                true_xent, dim=1),
-            fluid.layers.reduce_sum(
-                neg_xent, dim=1))
-        self.avg_cost = fluid.layers.reduce_mean(cost)
-        global_right_cnt = fluid.layers.create_global_var(
-            name="global_right_cnt",
-            persistable=True,
-            dtype='float32',
-            shape=[1],
-            value=0)
-        global_total_cnt = fluid.layers.create_global_var(
-            name="global_total_cnt",
-            persistable=True,
-            dtype='float32',
-            shape=[1],
-            value=0)
-        global_right_cnt.stop_gradient = True
-        global_total_cnt.stop_gradient = True
+        neg_logits = fluid.layers.reshape(neg_matmul, shape=[-1, 1])

-    def avg_loss(self):
-        self._cost = self.avg_cost
+        logits = fluid.layers.concat([true_logits, neg_logits], axis=0)
+        label_ones = fluid.layers.fill_constant(
+            shape=[fluid.layers.shape(true_logits)[0], 1],
+            value=1.0,
+            dtype='float32')
+        label_zeros = fluid.layers.fill_constant(
+            shape=[fluid.layers.shape(neg_logits)[0], 1],
+            value=0.0,
+            dtype='float32')
+        label = fluid.layers.concat([label_ones, label_zeros], axis=0)

-    def metrics(self):
-        self._metrics["LOSS"] = self.avg_cost
-
-    def train_net(self):
-        self.input()
-        self.net()
-        self.avg_loss()
-        self.metrics()
+        loss = fluid.layers.log_loss(fluid.layers.sigmoid(logits), label)
+        avg_cost = fluid.layers.reduce_sum(loss)
+        self._cost = avg_cost
+        self._metrics["LOSS"] = avg_cost

     def optimizer(self):
-        learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
-                                            None, self._namespace)
-        decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None,
-                                          self._namespace)
-        decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None,
-                                         self._namespace)
         optimizer = fluid.optimizer.SGD(
             learning_rate=fluid.layers.exponential_decay(
-                learning_rate=learning_rate,
-                decay_steps=decay_steps,
-                decay_rate=decay_rate,
+                learning_rate=self.learning_rate,
+                decay_steps=self.decay_steps,
+                decay_rate=self.decay_rate,
                 staircase=True))
         return optimizer

@@ -213,28 +162,22 @@ class Model(ModelBase):
                 use_double_buffer=False,
                 iterable=False)

-    def infer_net(self):
-        sparse_feature_dim = envs.get_global_env(
-            "hyper_parameters.sparse_feature_dim", None, self._namespace)
-        sparse_feature_number = envs.get_global_env(
-            "hyper_parameters.sparse_feature_number", None, self._namespace)
-
+    def infer_net(self, inputs):
         def embedding_layer(input, table_name, initializer_instance=None):
             emb = fluid.embedding(
                 input=input,
-                size=[sparse_feature_number, sparse_feature_dim],
+                size=[self.sparse_feature_number, self.sparse_feature_dim],
                 param_attr=table_name)
             return emb

-        self.analogy_input()
-        all_label = np.arange(sparse_feature_number).reshape(
-            sparse_feature_number).astype('int32')
+        all_label = np.arange(self.sparse_feature_number).reshape(
+            self.sparse_feature_number).astype('int32')
         self.all_label = fluid.layers.cast(
             x=fluid.layers.assign(all_label), dtype='int64')
         emb_all_label = embedding_layer(self.all_label, "emb")

-        emb_a = embedding_layer(self.analogy_a, "emb")
-        emb_b = embedding_layer(self.analogy_b, "emb")
-        emb_c = embedding_layer(self.analogy_c, "emb")
+        emb_a = embedding_layer(inputs[0], "emb")
+        emb_b = embedding_layer(inputs[1], "emb")
+        emb_c = embedding_layer(inputs[2], "emb")
         target = fluid.layers.elementwise_add(
             fluid.layers.elementwise_sub(emb_b, emb_a), emb_c)

@@ -245,34 +188,34 @@ class Model(ModelBase):
         values, pred_idx = fluid.layers.topk(input=dist, k=4)
         label = fluid.layers.expand(
             fluid.layers.unsqueeze(
-                self.analogy_d, axes=[1]),
-            expand_times=[1, 4])
+                inputs[3], axes=[1]), expand_times=[1, 4])
         label_ones = fluid.layers.fill_constant_batch_size_like(
             label, shape=[-1, 1], value=1.0, dtype='float32')
         right_cnt = fluid.layers.reduce_sum(input=fluid.layers.cast(
             fluid.layers.equal(pred_idx, label), dtype='float32'))
         total_cnt = fluid.layers.reduce_sum(label_ones)

-        global_right_cnt = fluid.layers.create_global_var(
-            name="global_right_cnt",
-            persistable=True,
-            dtype='float32',
-            shape=[1],
-            value=0)
-        global_total_cnt = fluid.layers.create_global_var(
-            name="global_total_cnt",
-            persistable=True,
-            dtype='float32',
-            shape=[1],
-            value=0)
-        global_right_cnt.stop_gradient = True
-        global_total_cnt.stop_gradient = True
-
-        tmp1 = fluid.layers.elementwise_add(right_cnt, global_right_cnt)
-        fluid.layers.assign(tmp1, global_right_cnt)
-        tmp2 = fluid.layers.elementwise_add(total_cnt, global_total_cnt)
-        fluid.layers.assign(tmp2, global_total_cnt)
-        acc = fluid.layers.elementwise_div(
-            global_right_cnt, global_total_cnt, name="total_acc")
+        # global_right_cnt = fluid.layers.create_global_var(
+        #     name="global_right_cnt",
+        #     persistable=True,
+        #     dtype='float32',
+        #     shape=[1],
+        #     value=0)
+        # global_total_cnt = fluid.layers.create_global_var(
+        #     name="global_total_cnt",
+        #     persistable=True,
+        #     dtype='float32',
+        #     shape=[1],
+        #     value=0)
+        # global_right_cnt.stop_gradient = True
+        # global_total_cnt.stop_gradient = True
+        # tmp1 = fluid.layers.elementwise_add(right_cnt, global_right_cnt)
+        # fluid.layers.assign(tmp1, global_right_cnt)
+        # tmp2 = fluid.layers.elementwise_add(total_cnt, global_total_cnt)
+        # fluid.layers.assign(tmp2, global_total_cnt)
+        # acc = fluid.layers.elementwise_div(
+        #     global_right_cnt, global_total_cnt, name="total_acc")
+        acc = fluid.layers.elementwise_div(right_cnt, total_cnt, name="acc")
         self._infer_results['acc'] = acc
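For intuition, the refactored loss above (concatenating the positive and negative logits and applying `log_loss` to their sigmoids) is ordinary negative-sampling logistic loss. The short numpy check below is illustrative only; the shapes and names are assumptions, not code from this repo:
```python
# Illustrative numpy check: log_loss-on-sigmoid over concatenated logits
# equals the classic sigmoid-cross-entropy negative-sampling objective.
import numpy as np


def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


batch, neg_num = 3, 5
rng = np.random.RandomState(0)
true_logits = rng.randn(batch, 1)            # score of the observed word
neg_logits = rng.randn(batch * neg_num, 1)   # scores of sampled negatives

# log_loss(sigmoid(logits), label) with label 1 for true, 0 for negatives.
logits = np.concatenate([true_logits, neg_logits], axis=0)
labels = np.concatenate(
    [np.ones_like(true_logits), np.zeros_like(neg_logits)], axis=0)
p = sigmoid(logits)
loss = -(labels * np.log(p) + (1 - labels) * np.log(1 - p)).sum()

# Equivalent sigmoid-cross-entropy form, matching the pre-refactor code path:
# -log(sigmoid(x)) = log1p(exp(-x)) and -log(1 - sigmoid(x)) = log1p(exp(x)).
ref = np.log1p(np.exp(-true_logits)).sum() + np.log1p(np.exp(neg_logits)).sum()
assert np.allclose(loss, ref)
```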
@@ -20,10 +20,10 @@ from paddlerec.core.reader import Reader
 from paddlerec.core.utils import envs

-class EvaluateReader(Reader):
+class TrainReader(Reader):
     def init(self):
-        dict_path = envs.get_global_env("word_id_dict_path", None,
-                                        "evaluate.reader")
+        dict_path = envs.get_global_env(
+            "dataset.dataset_infer.word_id_dict_path")
         self.word_to_id = dict()
         self.id_to_word = dict()
         with io.open(dict_path, 'r', encoding='utf-8') as f:
...
@@ -40,14 +40,12 @@ class NumpyRandomInt(object):

 class TrainReader(Reader):
     def init(self):
-        dict_path = envs.get_global_env("word_count_dict_path", None,
-                                        "train.reader")
-        self.window_size = envs.get_global_env("hyper_parameters.window_size",
-                                               None, "train.model")
-        self.neg_num = envs.get_global_env("hyper_parameters.neg_num", None,
-                                           "train.model")
+        dict_path = envs.get_global_env(
+            "dataset.dataset_train.word_count_dict_path")
+        self.window_size = envs.get_global_env("hyper_parameters.window_size")
+        self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
         self.with_shuffle_batch = envs.get_global_env(
-            "hyper_parameters.with_shuffle_batch", None, "train.model")
+            "hyper_parameters.with_shuffle_batch")
         self.random_generator = NumpyRandomInt(1, self.window_size + 1)
         self.cs = None
...