Unverified commit 8a9d286c authored by wuzhihua, committed by GitHub

Merge branch 'master' into doc_v7

......@@ -149,11 +149,13 @@ class Model(object):
return optimizer_i
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = envs.get_global_env("hyper_parameters.optimizer", None,
self._namespace)
return self._build_optimizer(optimizer, learning_rate)
opt_name = envs.get_global_env("hyper_parameters.optimizer.class")
opt_lr = envs.get_global_env(
"hyper_parameters.optimizer.learning_rate")
opt_strategy = envs.get_global_env(
"hyper_parameters.optimizer.strategy")
return self._build_optimizer(opt_name, opt_lr, opt_strategy)
def input_data(self, is_infer=False, **kwargs):
name = "dataset." + kwargs.get("dataset_name") + "."
......
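For reference, the refactored optimizer() above now reads its settings nested under hyper_parameters.optimizer in config.yaml; a minimal sketch matching the keys it queries (values are illustrative, taken from the config diffs later in this commit):

```yaml
hyper_parameters:
  optimizer:
    class: Adagrad        # read as hyper_parameters.optimizer.class
    learning_rate: 0.001  # read as hyper_parameters.optimizer.learning_rate
    strategy: async       # read as hyper_parameters.optimizer.strategy
```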
......@@ -167,6 +167,7 @@ class SingleInfer(TranspileTrainer):
model = envs.lazy_instance_by_fliename(
model_path, "Model")(self._env)
model._infer_data_var = model.input_data(
is_infer=True,
dataset_name=model_dict["dataset_name"])
if envs.get_global_env("dataset." + dataset_name +
".type") == "DataLoader":
......
......@@ -147,11 +147,6 @@ class SingleTrainer(TranspileTrainer):
startup_program = fluid.Program()
scope = fluid.Scope()
dataset_name = model_dict["dataset_name"]
opt_name = envs.get_global_env("hyper_parameters.optimizer.class")
opt_lr = envs.get_global_env(
"hyper_parameters.optimizer.learning_rate")
opt_strategy = envs.get_global_env(
"hyper_parameters.optimizer.strategy")
with fluid.program_guard(train_program, startup_program):
with fluid.unique_name.guard():
with fluid.scope_guard(scope):
......@@ -168,8 +163,7 @@ class SingleTrainer(TranspileTrainer):
self._get_dataloader(dataset_name,
model._data_loader)
model.net(model._data_var, False)
optimizer = model._build_optimizer(opt_name, opt_lr,
opt_strategy)
optimizer = model.optimizer()
optimizer.minimize(model._cost)
self._model[model_dict["name"]][0] = train_program
self._model[model_dict["name"]][1] = startup_program
......@@ -234,10 +228,12 @@ class SingleTrainer(TranspileTrainer):
scope = self._model[model_name][2]
program = self._model[model_name][0]
reader = self._dataset[reader_name]
threads = model_dict.get("thread_num", 1)
with fluid.scope_guard(scope):
self._exe.train_from_dataset(
program=program,
dataset=reader,
thread=threads,
fetch_list=fetch_vars,
fetch_info=fetch_alias,
print_period=fetch_period)
......@@ -247,8 +243,23 @@ class SingleTrainer(TranspileTrainer):
model_name = model_dict["name"]
model_class = self._model[model_name][3]
program = self._model[model_name][0].clone()
_build_strategy = fluid.BuildStrategy()
_exe_strategy = fluid.ExecutionStrategy()
# 0: kCoeffNumDevice; 1: One; 2: Customized
_build_strategy.gradient_scale_strategy = model_dict.get(
"gradient_scale_strategy", 0)
if "thread_num" in model_dict and model_dict["thread_num"] > 1:
_build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
_exe_strategy.num_threads = model_dict["thread_num"]
os.environ['CPU_NUM'] = str(_exe_strategy.num_threads)
program = fluid.compiler.CompiledProgram(program).with_data_parallel(
loss_name=model_class.get_avg_cost().name)
loss_name=model_class.get_avg_cost().name,
build_strategy=_build_strategy,
exec_strategy=_exe_strategy)
fetch_vars = []
fetch_alias = []
fetch_period = int(
......
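The thread_num and gradient_scale_strategy values consumed above come from the phase entry (model_dict) in config.yaml; a minimal sketch, with field names taken from the fasttext config later in this commit and illustrative values:

```yaml
phase:
  - name: phase1
    model: "{workspace}/model.py"   # user-defined model
    dataset_name: dataset_train     # select dataset by name
    thread_num: 2                   # >1 switches to the Reduce strategy and sets CPU_NUM
    gradient_scale_strategy: 1      # 0: kCoeffNumDevice; 1: One; 2: Customized
```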
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -12,28 +12,37 @@
# See the License for the specific language governing permissions and
# limitations under the License.
train:
trainer:
# for cluster training
strategy: "async"
workspace: "paddlerec.models.contentunderstanding.classification"
epochs: 10
workspace: "paddlerec.models.contentunderstanding.classification"
dataset:
- name: data1
batch_size: 5
type: DataLoader
data_path: "{workspace}/data/train_data"
data_converter: "{workspace}/reader.py"
hyper_parameters:
optimizer:
class: Adagrad
learning_rate: 0.001
is_sparse: False
reader:
batch_size: 5
class: "{workspace}/reader.py"
train_data_path: "{workspace}/train_data"
mode: runner1
model:
models: "{workspace}/model.py"
runner:
- name: runner1
class: single_train
epochs: 10
device: cpu
save_checkpoint_interval: 2
save_inference_interval: 4
save_checkpoint_path: "increment"
save_inference_path: "inference"
save_inference_feed_varnames: []
save_inference_fetch_varnames: []
save:
increment:
dirname: "increment"
epoch_interval: 1
save_last: True
inference:
dirname: "inference"
epoch_interval: 100
save_last: True
phase:
- name: phase1
model: "{workspace}/model.py"
dataset_name: data1
thread_num: 1
......@@ -27,19 +27,27 @@ class Model(ModelBase):
self.emb_dim = 8
self.hid_dim = 128
self.class_dim = 2
self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse",
False)
def train_net(self):
""" network definition """
def input_data(self, is_infer=False, **kwargs):
data = fluid.data(
name="input", shape=[None, self.max_len], dtype='int64')
label = fluid.data(name="label", shape=[None, 1], dtype='int64')
seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64')
return [data, label, seq_len]
self._data_var = [data, label, seq_len]
def net(self, input, is_infer=False):
""" network definition """
data = input[0]
label = input[1]
seq_len = input[2]
# embedding layer
emb = fluid.embedding(input=data, size=[self.dict_dim, self.emb_dim])
emb = fluid.embedding(
input=data,
size=[self.dict_dim, self.emb_dim],
is_sparse=self.is_sparse)
emb = fluid.layers.sequence_unpad(emb, length=seq_len)
# convolution layer
conv = fluid.nets.sequence_conv_pool(
......@@ -59,19 +67,8 @@ class Model(ModelBase):
avg_cost = fluid.layers.mean(x=cost)
acc = fluid.layers.accuracy(input=prediction, label=label)
self.cost = avg_cost
self._metrics["acc"] = acc
def get_avg_cost(self):
return self.cost
def get_metrics(self):
return self._metrics
def optimizer(self):
learning_rate = 0.01
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate)
return sgd_optimizer
def infer_net(self):
self.train_net()
self._cost = avg_cost
if is_infer:
self._infer_results["acc"] = acc
else:
self._metrics["acc"] = acc
......@@ -22,7 +22,7 @@ class TrainReader(Reader):
pass
def _process_line(self, l):
l = l.strip().split(" ")
l = l.strip().split()
data = l[0:10]
seq_len = l[10:11]
label = l[11:]
......@@ -37,8 +37,6 @@ class TrainReader(Reader):
data = [int(i) for i in data]
label = [int(i) for i in label]
seq_len = [int(i) for i in seq_len]
print >> sys.stderr, str(
[('data', data), ('label', label), ('seq_len', seq_len)])
yield [('data', data), ('label', label), ('seq_len', seq_len)]
return data_iter
......@@ -37,7 +37,18 @@
<img align="center" src="../../doc/imgs/cnn-ckim2014.png">
<p>
## Tutorial
## Tutorial (Quick Start)
```
python -m paddlerec.run -m paddlerec.models.contentunderstanding.tagspace
python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
```
## Tutorial (Reproducing Paper Results)
### Note
To make it easy to run every model end to end, we provide sample data under each model directory. To reproduce the results reported in this README, please use the scripts below to download the corresponding datasets and run the data preprocessing.
### Data Preprocessing
**(1)TagSpace**
......@@ -64,20 +75,42 @@ mv test.csv raw_big_test_data
python text2paddle.py raw_big_train_data/ raw_big_test_data/ train_big_data test_big_data big_vocab_text.txt big_vocab_tag.txt
```
**(2)Classification**
### Training
```
cd models/contentunderstanding/tagspace
python -m paddlerec.run -m ./config.yaml # after customizing hyperparameters, point to your own config file
```
### Inference
```
# in the model's config.yaml, set workspace to the absolute path of the current directory
# in the model's config.yaml, set mode to infer_runner
# example: mode: train_runner -> mode: infer_runner
# in the infer_runner entry, set class: single_infer
# switch the phase section to the infer configuration, following the comments in config.yaml (see the sketch after this code block)
# after editing config.yaml, run:
python -m paddlerec.run -m ./config.yaml
```
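A minimal sketch of what the infer-side runner in config.yaml could look like after these edits (field names follow the runner configs elsewhere in this commit; the exact values, e.g. the checkpoint path, are illustrative):

```yaml
mode: infer_runner
runner:
  - name: infer_runner
    class: single_infer
    epochs: 1
    device: cpu
    init_model_path: "increment/0"  # load a checkpoint saved by the train runner
```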
### Training
**(2)Classification**
### Training
```
python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
cd models/contentunderstanding/classification
python -m paddlerec.run -m ./config.yaml # after customizing hyperparameters, point to your own config file
```
### Inference
```
python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
# in the model's config.yaml, set workspace to the absolute path of the current directory
# in the model's config.yaml, set mode to infer_runner
# example: mode: train_runner -> mode: infer_runner
# in the infer_runner entry, set class: single_infer
# switch the phase section to the infer configuration, following the comments in config.yaml
# after editing config.yaml, run:
python -m paddlerec.run -m ./config.yaml
```
## Performance Comparison
......
......@@ -12,38 +12,44 @@
# See the License for the specific language governing permissions and
# limitations under the License.
train:
trainer:
# for cluster training
strategy: "async"
workspace: "paddlerec.models.contentunderstanding.tagspace"
epochs: 10
workspace: "paddlerec.models.contentunderstanding.tagspace"
dataset:
- name: sample_1
type: QueueDataset
batch_size: 5
data_path: "{workspace}/data/train_data"
data_converter: "{workspace}/reader.py"
reader:
batch_size: 5
class: "{workspace}/reader.py"
train_data_path: "{workspace}/train_data"
hyper_parameters:
optimizer:
class: Adagrad
learning_rate: 0.001
vocab_text_size: 11447
vocab_tag_size: 4
emb_dim: 10
hid_dim: 1000
win_size: 5
margin: 0.1
neg_size: 3
num_devices: 1
model:
models: "{workspace}/model.py"
hyper_parameters:
vocab_text_size: 11447
vocab_tag_size: 4
emb_dim: 10
hid_dim: 1000
win_size: 5
margin: 0.1
neg_size: 3
num_devices: 1
mode: runner1
runner:
- name: runner1
class: single_train
epochs: 10
device: cpu
save_checkpoint_interval: 2
save_inference_interval: 4
save_checkpoint_path: "increment"
save_inference_path: "inference"
save_inference_feed_varnames: []
save_inference_fetch_varnames: []
save:
increment:
dirname: "increment"
epoch_interval: 1
save_last: True
inference:
dirname: "inference"
epoch_interval: 100
save_last: True
phase:
- name: phase1
model: "{workspace}/model.py"
dataset_name: sample_1
thread_num: 1
......@@ -26,26 +26,30 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
self.cost = None
self.metrics = {}
self.vocab_text_size = envs.get_global_env("vocab_text_size", None,
self._namespace)
self.vocab_tag_size = envs.get_global_env("vocab_tag_size", None,
self._namespace)
self.emb_dim = envs.get_global_env("emb_dim", None, self._namespace)
self.hid_dim = envs.get_global_env("hid_dim", None, self._namespace)
self.win_size = envs.get_global_env("win_size", None, self._namespace)
self.margin = envs.get_global_env("margin", None, self._namespace)
self.neg_size = envs.get_global_env("neg_size", None, self._namespace)
self.vocab_text_size = envs.get_global_env(
"hyper_parameters.vocab_text_size")
self.vocab_tag_size = envs.get_global_env(
"hyper_parameters.vocab_tag_size")
self.emb_dim = envs.get_global_env("hyper_parameters.emb_dim")
self.hid_dim = envs.get_global_env("hyper_parameters.hid_dim")
self.win_size = envs.get_global_env("hyper_parameters.win_size")
self.margin = envs.get_global_env("hyper_parameters.margin")
self.neg_size = envs.get_global_env("hyper_parameters.neg_size")
def train_net(self):
""" network"""
def input_data(self, is_infer=False, **kwargs):
text = fluid.data(
name="text", shape=[None, 1], lod_level=1, dtype='int64')
pos_tag = fluid.data(
name="pos_tag", shape=[None, 1], lod_level=1, dtype='int64')
neg_tag = fluid.data(
name="neg_tag", shape=[None, 1], lod_level=1, dtype='int64')
return [text, pos_tag, neg_tag]
self._data_var = [text, pos_tag, neg_tag]
def net(self, input, is_infer=False):
""" network"""
text = input[0]
pos_tag = input[1]
neg_tag = input[2]
text_emb = fluid.embedding(
input=text,
......@@ -97,22 +101,11 @@ class Model(ModelBase):
avg_cost = nn.mean(loss_part3)
less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32')
correct = nn.reduce_sum(less)
self.cost = avg_cost
self.metrics["correct"] = correct
self.metrics["cos_pos"] = cos_pos
def get_avg_cost(self):
return self.cost
def get_metrics(self):
return self.metrics
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.base_lr", None,
self._namespace)
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate)
return sgd_optimizer
self._cost = avg_cost
def infer_net(self, parameter_list):
self.train_net()
if is_infer:
self._infer_results["correct"] = correct
self._infer_results["cos_pos"] = cos_pos
else:
self._metrics["correct"] = correct
self._metrics["cos_pos"] = cos_pos
......@@ -11,44 +11,66 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
evaluate:
reader:
batch_size: 1
class: "{workspace}/synthetic_evaluate_reader.py"
test_data_path: "{workspace}/data/train"
train:
trainer:
# for cluster training
strategy: "async"
epochs: 4
workspace: "paddlerec.models.match.dssm"
reader:
batch_size: 4
class: "{workspace}/synthetic_reader.py"
train_data_path: "{workspace}/data/train"
workspace: "paddlerec.models.match.dssm"
dataset:
- name: dataset_train
batch_size: 4
type: QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/synthetic_reader.py"
- name: dataset_infer
batch_size: 1
type: QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/synthetic_evaluate_reader.py"
model:
models: "{workspace}/model.py"
hyper_parameters:
TRIGRAM_D: 1000
NEG: 4
fc_sizes: [300, 300, 128]
fc_acts: ['tanh', 'tanh', 'tanh']
learning_rate: 0.01
optimizer: sgd
hyper_parameters:
optimizer:
class: sgd
learning_rate: 0.01
strategy: async
trigram_d: 1000
neg_num: 4
fc_sizes: [300, 300, 128]
fc_acts: ['tanh', 'tanh', 'tanh']
save:
increment:
dirname: "increment"
epoch_interval: 2
save_last: True
mode: train_runner
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner:
- name: train_runner
class: single_train
# num of epochs
epochs: 4
# device to run training or infer
device: cpu
save_checkpoint_interval: 2 # save model interval of epochs
save_inference_interval: 4 # save inference
save_checkpoint_path: "increment" # save checkpoint path
save_inference_path: "inference" # save inference path
save_inference_feed_varnames: ["query", "doc_pos"] # feed vars of save inference
save_inference_fetch_varnames: ["cos_sim_0.tmp_0"] # fetch vars of save inference
init_model_path: "" # load model path
fetch_period: 2
- name: infer_runner
class: single_infer
# num of epochs
epochs: 1
# device to run training or infer
device: cpu
fetch_period: 1
init_model_path: "increment/2" # load model path
inference:
dirname: "inference"
epoch_interval: 4
feed_varnames: ["query", "doc_pos"]
fetch_varnames: ["cos_sim_0.tmp_0"]
save_last: True
# runner will run all the phase in each epoch
phase:
- name: phase1
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_train # select dataset by name
thread_num: 1
#- name: phase2
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_infer # select dataset by name
# thread_num: 1
......@@ -22,45 +22,39 @@ class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
def input(self):
TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None,
self._namespace)
Neg = envs.get_global_env("hyper_parameters.NEG", None,
self._namespace)
self.query = fluid.data(
name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_pos = fluid.data(
def _init_hyper_parameters(self):
self.trigram_d = envs.get_global_env("hyper_parameters.trigram_d")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
self.hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes")
self.hidden_acts = envs.get_global_env("hyper_parameters.fc_acts")
self.learning_rate = envs.get_global_env(
"hyper_parameters.learning_rate")
def input_data(self, is_infer=False, **kwargs):
query = fluid.data(
name="query",
shape=[-1, self.trigram_d],
dtype='float32',
lod_level=0)
doc_pos = fluid.data(
name="doc_pos",
shape=[-1, TRIGRAM_D],
shape=[-1, self.trigram_d],
dtype='float32',
lod_level=0)
self.doc_negs = [
if is_infer:
return [query, doc_pos]
doc_negs = [
fluid.data(
name="doc_neg_" + str(i),
shape=[-1, TRIGRAM_D],
shape=[-1, self.trigram_d],
dtype="float32",
lod_level=0) for i in range(Neg)
lod_level=0) for i in range(self.neg_num)
]
self._data_var.append(self.query)
self._data_var.append(self.doc_pos)
for input in self.doc_negs:
self._data_var.append(input)
if self._platform != "LINUX":
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def net(self, is_infer=False):
hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None,
self._namespace)
hidden_acts = envs.get_global_env("hyper_parameters.fc_acts", None,
self._namespace)
return [query, doc_pos] + doc_negs
def net(self, inputs, is_infer=False):
def fc(data, hidden_layers, hidden_acts, names):
fc_inputs = [data]
for i in range(len(hidden_layers)):
......@@ -77,71 +71,30 @@ class Model(ModelBase):
fc_inputs.append(out)
return fc_inputs[-1]
query_fc = fc(self.query, hidden_layers, hidden_acts,
query_fc = fc(inputs[0], self.hidden_layers, self.hidden_acts,
['query_l1', 'query_l2', 'query_l3'])
doc_pos_fc = fc(self.doc_pos, hidden_layers, hidden_acts,
doc_pos_fc = fc(inputs[1], self.hidden_layers, self.hidden_acts,
['doc_pos_l1', 'doc_pos_l2', 'doc_pos_l3'])
self.R_Q_D_p = fluid.layers.cos_sim(query_fc, doc_pos_fc)
R_Q_D_p = fluid.layers.cos_sim(query_fc, doc_pos_fc)
if is_infer:
self._infer_results["query_doc_sim"] = R_Q_D_p
return
R_Q_D_ns = []
for i, doc_neg in enumerate(self.doc_negs):
doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts, [
'doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i),
'doc_neg_l3_' + str(i)
])
for i in range(len(inputs) - 2):
doc_neg_fc_i = fc(
inputs[i + 2], self.hidden_layers, self.hidden_acts, [
'doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i),
'doc_neg_l3_' + str(i)
])
R_Q_D_ns.append(fluid.layers.cos_sim(query_fc, doc_neg_fc_i))
concat_Rs = fluid.layers.concat(
input=[self.R_Q_D_p] + R_Q_D_ns, axis=-1)
concat_Rs = fluid.layers.concat(input=[R_Q_D_p] + R_Q_D_ns, axis=-1)
prob = fluid.layers.softmax(concat_Rs, axis=1)
hit_prob = fluid.layers.slice(
prob, axes=[0, 1], starts=[0, 0], ends=[4, 1])
loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob))
self.avg_cost = fluid.layers.mean(x=loss)
def infer_results(self):
self._infer_results['query_doc_sim'] = self.R_Q_D_p
def avg_loss(self):
self._cost = self.avg_cost
def metrics(self):
self._metrics["LOSS"] = self.avg_cost
def train_net(self):
self.input()
self.net(is_infer=False)
self.avg_loss()
self.metrics()
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.SGD(learning_rate)
return optimizer
def infer_input(self):
TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None,
self._namespace)
self.query = fluid.data(
name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_pos = fluid.data(
name="doc_pos",
shape=[-1, TRIGRAM_D],
dtype='float32',
lod_level=0)
self._infer_data_var = [self.query, self.doc_pos]
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self):
self.infer_input()
self.net(is_infer=True)
self.infer_results()
avg_cost = fluid.layers.mean(x=loss)
self._cost = avg_cost
self._metrics["LOSS"] = avg_cost
......@@ -16,7 +16,7 @@ from __future__ import print_function
from paddlerec.core.reader import Reader
class EvaluateReader(Reader):
class TrainReader(Reader):
def init(self):
pass
......
......@@ -11,49 +11,73 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
evaluate:
workspace: "paddlerec.models.match.multiview-simnet"
reader:
batch_size: 2
class: "{workspace}/evaluate_reader.py"
test_data_path: "{workspace}/data/test"
train:
trainer:
# for cluster training
strategy: "async"
# workspace
workspace: "paddlerec.models.match.multiview-simnet"
epochs: 2
workspace: "paddlerec.models.match.multiview-simnet"
# list of dataset
dataset:
- name: dataset_train # name of dataset to distinguish different datasets
batch_size: 2
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/train"
sparse_slots: "1 2 3"
- name: dataset_infer # name
batch_size: 2
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/test"
sparse_slots: "1 2"
reader:
batch_size: 2
class: "{workspace}/reader.py"
train_data_path: "{workspace}/data/train"
dataset_class: "DataLoader"
# hyper parameters of user-defined network
hyper_parameters:
optimizer:
class: Adam
learning_rate: 0.0001
strategy: async
query_encoder: "bow"
title_encoder: "bow"
query_encode_dim: 128
title_encode_dim: 128
sparse_feature_dim: 1000001
embedding_dim: 128
hidden_size: 128
margin: 0.1
model:
models: "{workspace}/model.py"
hyper_parameters:
use_DataLoader: True
query_encoder: "bow"
title_encoder: "bow"
query_encode_dim: 128
title_encode_dim: 128
query_slots: 1
title_slots: 1
sparse_feature_dim: 1000001
embedding_dim: 128
hidden_size: 128
learning_rate: 0.0001
optimizer: adam
# select runner by name
mode: train_runner
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner:
- name: train_runner
class: single_train
# num of epochs
epochs: 2
# device to run training or infer
device: cpu
save_checkpoint_interval: 1 # save model interval of epochs
save_inference_interval: 1 # save inference
save_checkpoint_path: "increment" # save checkpoint path
save_inference_path: "inference" # save inference path
save_inference_feed_varnames: [] # feed vars of save inference
save_inference_fetch_varnames: [] # fetch vars of save inference
init_model_path: "" # load model path
fetch_period: 1
- name: infer_runner
class: single_infer
# num of epochs
epochs: 1
# device to run training or infer
device: cpu
fetch_period: 1
init_model_path: "increment/0" # load model path
save:
increment:
dirname: "increment"
epoch_interval: 1
save_last: True
inference:
dirname: "inference"
epoch_interval: 1
save_last: True
# runner will run all the phase in each epoch
phase:
- name: phase1
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_train # select dataset by name
thread_num: 1
#- name: phase2
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_infer # select dataset by name
# thread_num: 1
......@@ -99,143 +99,89 @@ class SimpleEncoderFactory(object):
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
self.init_config()
def init_config(self):
self._fetch_interval = 1
query_encoder = envs.get_global_env("hyper_parameters.query_encoder",
None, self._namespace)
title_encoder = envs.get_global_env("hyper_parameters.title_encoder",
None, self._namespace)
query_encode_dim = envs.get_global_env(
"hyper_parameters.query_encode_dim", None, self._namespace)
title_encode_dim = envs.get_global_env(
"hyper_parameters.title_encode_dim", None, self._namespace)
query_slots = envs.get_global_env("hyper_parameters.query_slots", None,
self._namespace)
title_slots = envs.get_global_env("hyper_parameters.title_slots", None,
self._namespace)
factory = SimpleEncoderFactory()
self.query_encoders = [
factory.create(query_encoder, query_encode_dim)
for i in range(query_slots)
]
self.title_encoders = [
factory.create(title_encoder, title_encode_dim)
for i in range(title_slots)
]
def _init_hyper_parameters(self):
self.query_encoder = envs.get_global_env(
"hyper_parameters.query_encoder")
self.title_encoder = envs.get_global_env(
"hyper_parameters.title_encoder")
self.query_encode_dim = envs.get_global_env(
"hyper_parameters.query_encode_dim")
self.title_encode_dim = envs.get_global_env(
"hyper_parameters.title_encode_dim")
self.emb_size = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim",
None, self._namespace)
"hyper_parameters.sparse_feature_dim")
self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim")
self.emb_shape = [self.emb_size, self.emb_dim]
self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size",
None, self._namespace)
self.margin = 0.1
def input(self, is_train=True):
self.q_slots = [
fluid.data(
name="%d" % i, shape=[None, 1], lod_level=1, dtype='int64')
for i in range(len(self.query_encoders))
]
self.pt_slots = [
fluid.data(
name="%d" % (i + len(self.query_encoders)),
shape=[None, 1],
lod_level=1,
dtype='int64') for i in range(len(self.title_encoders))
]
if is_train == False:
return self.q_slots + self.pt_slots
self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size")
self.margin = envs.get_global_env("hyper_parameters.margin")
self.nt_slots = [
fluid.data(
name="%d" %
(i + len(self.query_encoders) + len(self.title_encoders)),
shape=[None, 1],
lod_level=1,
dtype='int64') for i in range(len(self.title_encoders))
def net(self, input, is_infer=False):
factory = SimpleEncoderFactory()
self.q_slots = self._sparse_data_var[0:1]
self.query_encoders = [
factory.create(self.query_encoder, self.query_encode_dim)
for _ in self.q_slots
]
return self.q_slots + self.pt_slots + self.nt_slots
def train_input(self):
res = self.input()
self._data_var = res
use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader",
False, self._namespace)
if self._platform != "LINUX" or use_dataloader:
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var,
capacity=256,
use_double_buffer=False,
iterable=False)
def get_acc(self, x, y):
less = tensor.cast(cf.less_than(x, y), dtype='float32')
label_ones = fluid.layers.fill_constant_batch_size_like(
input=x, dtype='float32', shape=[-1, 1], value=1.0)
correct = fluid.layers.reduce_sum(less)
total = fluid.layers.reduce_sum(label_ones)
acc = fluid.layers.elementwise_div(correct, total)
return acc
def net(self):
q_embs = [
fluid.embedding(
input=query, size=self.emb_shape, param_attr="emb")
for query in self.q_slots
]
pt_embs = [
fluid.embedding(
input=title, size=self.emb_shape, param_attr="emb")
for title in self.pt_slots
]
nt_embs = [
fluid.embedding(
input=title, size=self.emb_shape, param_attr="emb")
for title in self.nt_slots
]
# encode each embedding field with encoder
q_encodes = [
self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
]
pt_encodes = [
self.title_encoders[i].forward(emb)
for i, emb in enumerate(pt_embs)
]
nt_encodes = [
self.title_encoders[i].forward(emb)
for i, emb in enumerate(nt_embs)
]
# concat multi view for query, pos_title, neg_title
q_concat = fluid.layers.concat(q_encodes)
pt_concat = fluid.layers.concat(pt_encodes)
nt_concat = fluid.layers.concat(nt_encodes)
# projection of hidden layer
q_hid = fluid.layers.fc(q_concat,
size=self.hidden_size,
param_attr='q_fc.w',
bias_attr='q_fc.b')
self.pt_slots = self._sparse_data_var[1:2]
self.title_encoders = [
factory.create(self.title_encoder, self.title_encode_dim)
]
pt_embs = [
fluid.embedding(
input=title, size=self.emb_shape, param_attr="emb")
for title in self.pt_slots
]
pt_encodes = [
self.title_encoders[i].forward(emb)
for i, emb in enumerate(pt_embs)
]
pt_concat = fluid.layers.concat(pt_encodes)
pt_hid = fluid.layers.fc(pt_concat,
size=self.hidden_size,
param_attr='t_fc.w',
bias_attr='t_fc.b')
# cosine of hidden layers
cos_pos = fluid.layers.cos_sim(q_hid, pt_hid)
if is_infer:
self._infer_results['query_pt_sim'] = cos_pos
return
self.nt_slots = self._sparse_data_var[2:3]
nt_embs = [
fluid.embedding(
input=title, size=self.emb_shape, param_attr="emb")
for title in self.nt_slots
]
nt_encodes = [
self.title_encoders[i].forward(emb)
for i, emb in enumerate(nt_embs)
]
nt_concat = fluid.layers.concat(nt_encodes)
nt_hid = fluid.layers.fc(nt_concat,
size=self.hidden_size,
param_attr='t_fc.w',
bias_attr='t_fc.b')
# cosine of hidden layers
cos_pos = fluid.layers.cos_sim(q_hid, pt_hid)
cos_neg = fluid.layers.cos_sim(q_hid, nt_hid)
# pairwise hinge_loss
......@@ -254,72 +200,16 @@ class Model(ModelBase):
input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
loss_part2)
self.avg_cost = fluid.layers.mean(loss_part3)
self._cost = fluid.layers.mean(loss_part3)
self.acc = self.get_acc(cos_neg, cos_pos)
def avg_loss(self):
self._cost = self.avg_cost
def metrics(self):
self._metrics["loss"] = self.avg_cost
self._metrics["loss"] = self._cost
self._metrics["acc"] = self.acc
def train_net(self):
self.train_input()
self.net()
self.avg_loss()
self.metrics()
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
return optimizer
def infer_input(self):
res = self.input(is_train=False)
self._infer_data_var = res
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self):
self.infer_input()
# lookup embedding for each slot
q_embs = [
fluid.embedding(
input=query, size=self.emb_shape, param_attr="emb")
for query in self.q_slots
]
pt_embs = [
fluid.embedding(
input=title, size=self.emb_shape, param_attr="emb")
for title in self.pt_slots
]
# encode each embedding field with encoder
q_encodes = [
self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
]
pt_encodes = [
self.title_encoders[i].forward(emb)
for i, emb in enumerate(pt_embs)
]
# concat multi view for query, pos_title, neg_title
q_concat = fluid.layers.concat(q_encodes)
pt_concat = fluid.layers.concat(pt_encodes)
# projection of hidden layer
q_hid = fluid.layers.fc(q_concat,
size=self.hidden_size,
param_attr='q_fc.w',
bias_attr='q_fc.b')
pt_hid = fluid.layers.fc(pt_concat,
size=self.hidden_size,
param_attr='t_fc.w',
bias_attr='t_fc.b')
# cosine of hidden layers
cos = fluid.layers.cos_sim(q_hid, pt_hid)
self._infer_results['query_pt_sim'] = cos
def get_acc(self, x, y):
less = tensor.cast(cf.less_than(x, y), dtype='float32')
label_ones = fluid.layers.fill_constant_batch_size_like(
input=x, dtype='float32', shape=[-1, 1], value=1.0)
correct = fluid.layers.reduce_sum(less)
total = fluid.layers.reduce_sum(label_ones)
acc = fluid.layers.elementwise_div(correct, total)
return acc
......@@ -31,9 +31,21 @@
<img align="center" src="../../doc/imgs/multiview-simnet.png">
<p>
## Tutorial
### Training & Inference
## Tutorial (Quick Start)
### Training
```shell
python -m paddlerec.run -m paddlerec.models.match.dssm # dssm
python -m paddlerec.run -m paddlerec.models.match.multiview-simnet # multiview-simnet
```
### Inference
```shell
# in the model's config.yaml, set workspace to the absolute path of the current directory
# in the model's config.yaml, set mode to infer_runner
# example: mode: train_runner -> mode: infer_runner
# in the infer_runner entry, set class: single_infer
# switch the phase section to the infer configuration, following the comments in config.yaml (see the sketch after this code block)
# after editing config.yaml, run:
python -m paddlerec.run -m ./config.yaml # taking dssm as an example
```
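Taking dssm as an example, a minimal sketch of the config.yaml changes this amounts to (based on the dssm config shown earlier in this commit; the phase entry mirrors the commented-out phase2 block there):

```yaml
mode: infer_runner   # was: mode: train_runner
phase:
  - name: phase2
    model: "{workspace}/model.py"   # user-defined model
    dataset_name: dataset_infer     # select dataset by name
    thread_num: 1
```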
......@@ -59,7 +59,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
self.cat_feat_idx_dict_list = [{} for _ in range(26)]
# TODO: set vocabulary dictionary
vocab_dir = "./vocab/"
vocab_dir = "./sample_data/vocab/"
for i in range(26):
lookup_idx = 1 # remain 0 for default value
for line in open(
......
......@@ -12,9 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import yaml
import yaml, os
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
import paddle.fluid.incubate.data_generator as dg
try:
import cPickle as pickle
except ImportError:
......@@ -44,7 +46,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
self.continuous_range_ = range(1, 14)
self.categorical_range_ = range(14, 40)
# load preprocessed feature dict
self.feat_dict_name = "aid_data/feat_dict_10.pkl2"
self.feat_dict_name = "sample_data/feat_dict_10.pkl2"
self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb'))
def _process_line(self, line):
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# global settings
debug: false
workspace: "paddlerec.models.rank.deepfm"
dataset:
- name: train_sample
type: QueueDataset
batch_size: 5
data_path: "{workspace}/data/sample_data/train"
sparse_slots: "label feat_idx"
dense_slots: "feat_value:39"
- name: infer_sample
type: QueueDataset
batch_size: 5
data_path: "{workspace}/data/sample_data/train"
sparse_slots: "label feat_idx"
dense_slots: "feat_value:39"
hyper_parameters:
optimizer:
class: SGD
learning_rate: 0.0001
sparse_feature_number: 1086460
sparse_feature_dim: 9
num_field: 39
reg: 0.001
mode: train_runner
# if infer, change mode to "infer_runner" and change phase to "infer_phase"
runner:
- name: train_runner
trainer_class: single_train
epochs: 2
device: cpu
init_model_path: ""
save_checkpoint_interval: 1
save_inference_interval: 1
save_checkpoint_path: "increment"
save_inference_path: "inference"
print_interval: 1
- name: infer_runner
trainer_class: single_infer
epochs: 1
device: cpu
init_model_path: "increment/0"
print_interval: 1
phase:
- name: phase1
model: "{workspace}/model.py"
dataset_name: train_sample
thread_num: 1
#- name: infer_phase
# model: "{workspace}/model.py"
# dataset_name: infer_sample
# thread_num: 1
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
import sys
LOCAL_PATH = os.path.dirname(os.path.abspath(__file__))
TOOLS_PATH = os.path.join(LOCAL_PATH, "..", "..", "tools")
sys.path.append(TOOLS_PATH)
from paddlerec.tools.tools import download_file_and_uncompress, download_file
if __name__ == '__main__':
url = "https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz"
url2 = "https://paddlerec.bj.bcebos.com/deepfm%2Ffeat_dict_10.pkl2"
print("download and extract starting...")
download_file_and_uncompress(url)
download_file(url2, "./aid_data/feat_dict_10.pkl2", True)
print("download and extract finished")
print("preprocessing...")
os.system("python preprocess.py")
print("preprocess done")
shutil.rmtree("raw_data")
print("done")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import yaml
import os
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
import paddle.fluid.incubate.data_generator as dg
try:
import cPickle as pickle
except ImportError:
import pickle
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
if os.path.isfile(config):
with open(config, 'r') as rb:
_config = yaml.load(rb.read(), Loader=yaml.FullLoader)
else:
raise ValueError("reader config only support yaml")
def init(self):
self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
self.cont_max_ = [
5775, 257675, 65535, 969, 23159456, 431037, 56311, 6047, 29019, 46,
231, 4008, 7393
]
self.cont_diff_ = [
self.cont_max_[i] - self.cont_min_[i]
for i in range(len(self.cont_min_))
]
self.continuous_range_ = range(1, 14)
self.categorical_range_ = range(14, 40)
# load preprocessed feature dict
self.feat_dict_name = "sample_data/feat_dict_10.pkl2"
self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb'))
def _process_line(self, line):
features = line.rstrip('\n').split('\t')
feat_idx = []
feat_value = []
for idx in self.continuous_range_:
if features[idx] == '':
feat_idx.append(0)
feat_value.append(0.0)
else:
feat_idx.append(self.feat_dict_[idx])
feat_value.append(
(float(features[idx]) - self.cont_min_[idx - 1]) /
self.cont_diff_[idx - 1])
for idx in self.categorical_range_:
if features[idx] == '' or features[idx] not in self.feat_dict_:
feat_idx.append(0)
feat_value.append(0.0)
else:
feat_idx.append(self.feat_dict_[features[idx]])
feat_value.append(1.0)
label = [int(features[0])]
return feat_idx, feat_value, label
def generate_sample(self, line):
"""
Read the data line by line and process it as a dictionary
"""
def data_iter():
feat_idx, feat_value, label = self._process_line(line)
s = ""
for i in [('feat_idx', feat_idx), ('feat_value', feat_value),
('label', label)]:
k = i[0]
v = i[1]
for j in v:
s += " " + k + ":" + str(j)
print(s.strip())
yield None
return data_iter
reader = TrainReader(
"../config.yaml") # run this file in original folder to find config.yaml
reader.init()
reader.run_from_stdin()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy
from collections import Counter
import shutil
import pickle
def get_raw_data():
if not os.path.isdir('raw_data'):
os.mkdir('raw_data')
fin = open('train.txt', 'r')
fout = open('raw_data/part-0', 'w')
for line_idx, line in enumerate(fin):
if line_idx % 200000 == 0 and line_idx != 0:
fout.close()
cur_part_idx = int(line_idx / 200000)
fout = open('raw_data/part-' + str(cur_part_idx), 'w')
fout.write(line)
fout.close()
fin.close()
def split_data():
split_rate_ = 0.9
dir_train_file_idx_ = 'aid_data/train_file_idx.txt'
filelist_ = [
'raw_data/part-%d' % x for x in range(len(os.listdir('raw_data')))
]
if not os.path.exists(dir_train_file_idx_):
train_file_idx = list(
numpy.random.choice(
len(filelist_), int(len(filelist_) * split_rate_), False))
with open(dir_train_file_idx_, 'w') as fout:
fout.write(str(train_file_idx))
else:
with open(dir_train_file_idx_, 'r') as fin:
train_file_idx = eval(fin.read())
for idx in range(len(filelist_)):
if idx in train_file_idx:
shutil.move(filelist_[idx], 'train_data')
else:
shutil.move(filelist_[idx], 'test_data')
def get_feat_dict():
freq_ = 10
dir_feat_dict_ = 'aid_data/feat_dict_' + str(freq_) + '.pkl2'
continuous_range_ = range(1, 14)
categorical_range_ = range(14, 40)
if not os.path.exists(dir_feat_dict_):
# Count the number of occurrences of discrete features
feat_cnt = Counter()
with open('train.txt', 'r') as fin:
for line_idx, line in enumerate(fin):
if line_idx % 100000 == 0:
print('generating feature dict', line_idx / 45000000)
features = line.rstrip('\n').split('\t')
for idx in categorical_range_:
if features[idx] == '': continue
feat_cnt.update([features[idx]])
# Only retain discrete features with high frequency
dis_feat_set = set()
for feat, ot in feat_cnt.items():
if ot >= freq_:
dis_feat_set.add(feat)
# Create a dictionary for continuous and discrete features
feat_dict = {}
tc = 1
# Continuous features
for idx in continuous_range_:
feat_dict[idx] = tc
tc += 1
for feat in dis_feat_set:
feat_dict[feat] = tc
tc += 1
# Save dictionary
with open(dir_feat_dict_, 'wb') as fout:
pickle.dump(feat_dict, fout, protocol=2)
print('args.num_feat ', len(feat_dict) + 1)
if __name__ == '__main__':
if not os.path.isdir('train_data'):
os.mkdir('train_data')
if not os.path.isdir('test_data'):
os.mkdir('test_data')
if not os.path.isdir('aid_data'):
os.mkdir('aid_data')
get_raw_data()
split_data()
get_feat_dict()
print('Done!')
python download_preprocess.py
mkdir slot_train_data
for i in `ls ./train_data`
do
cat train_data/$i | python get_slot_data.py > slot_train_data/$i
done
mkdir slot_test_data
for i in `ls ./test_data`
do
cat test_data/$i | python get_slot_data.py > slot_test_data/$i
done
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid as fluid
from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
def _init_hyper_parameters(self):
self.sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None)
self.num_field = envs.get_global_env("hyper_parameters.num_field",
None)
self.reg = envs.get_global_env("hyper_parameters.reg", 1e-4)
def net(self, inputs, is_infer=False):
init_value_ = 0.1
is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
# ------------------------- network input --------------------------
raw_feat_idx = self._sparse_data_var[1]
raw_feat_value = self._dense_data_var[0]
self.label = self._sparse_data_var[0]
feat_idx = raw_feat_idx
feat_value = fluid.layers.reshape(
raw_feat_value, [-1, self.num_field]) # None * num_field * 1
first_weights_re = fluid.embedding(
input=feat_idx,
is_sparse=True,
is_distributed=is_distributed,
dtype='float32',
size=[self.sparse_feature_number + 1, 1],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormalInitializer(
loc=0.0, scale=init_value_),
regularizer=fluid.regularizer.L1DecayRegularizer(self.reg)))
first_weights = fluid.layers.reshape(
first_weights_re,
shape=[-1, self.num_field]) # None * num_field * 1
y_first_order = fluid.layers.reduce_sum(
first_weights * feat_value, 1, keep_dim=True)
b_linear = fluid.layers.create_parameter(
shape=[1],
dtype='float32',
default_initializer=fluid.initializer.ConstantInitializer(value=0))
self.predict = fluid.layers.sigmoid(y_first_order + b_linear)
cost = fluid.layers.log_loss(
input=self.predict, label=fluid.layers.cast(self.label, "float32"))
avg_cost = fluid.layers.reduce_sum(cost)
self._cost = avg_cost
predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1)
label_int = fluid.layers.cast(self.label, 'int64')
auc_var, batch_auc_var, _ = fluid.layers.auc(input=predict_2d,
label=label_int,
slide_steps=0)
self._metrics["AUC"] = auc_var
self._metrics["BATCH_AUC"] = batch_auc_var
if is_infer:
self._infer_results["AUC"] = auc_var
......@@ -11,7 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import yaml
import yaml, os
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
try:
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import yaml
import yaml, os
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
try:
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "paddlerec.models.recall.fasttext"
# list of dataset
dataset:
- name: dataset_train # name of dataset to distinguish different datasets
batch_size: 10
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/train"
word_count_dict_path: "{workspace}/data/dict/word_count_dict.txt"
word_ngrams_path: "{workspace}/data/dict/word_ngrams_id.txt"
data_converter: "{workspace}/reader.py"
- name: dataset_infer # name
batch_size: 10
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/test"
word_id_dict_path: "{workspace}/data/dict/word_id_dict.txt"
data_converter: "{workspace}/evaluate_reader.py"
hyper_parameters:
optimizer:
learning_rate: 1.0
decay_steps: 100000
decay_rate: 0.999
class: sgd
strategy: async
sparse_feature_number: 227915
sparse_feature_dim: 300
with_shuffle_batch: False
neg_num: 5
window_size: 5
min_n: 3
max_n: 5
# select runner by name
mode: train_runner
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner:
- name: train_runner
class: single_train
# num of epochs
epochs: 2
# device to run training or infer
device: cpu
save_checkpoint_interval: 1 # save model interval of epochs
save_inference_interval: 1 # save inference
save_checkpoint_path: "increment" # save checkpoint path
save_inference_path: "inference" # save inference path
save_inference_feed_varnames: [] # feed vars of save inference
save_inference_fetch_varnames: [] # fetch vars of save inference
init_model_path: "" # load model path
fetch_period: 10
- name: infer_runner
class: single_infer
# num of epochs
epochs: 1
# device to run training or infer
device: cpu
init_model_path: "increment/0" # load model path
# runner will run all the phase in each epoch
phase:
- name: phase1
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_train # select dataset by name
thread_num: 1
gradient_scale_strategy: 1
#- name: phase2
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_infer # select dataset by name
# thread_num: 1
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Athens Greece Baghdad Iraq
Athens Greece Bangkok Thailand
Athens Greece Beijing China
Athens Greece Berlin Germany
Athens Greece Bern Switzerland
Athens Greece Cairo Egypt
Athens Greece Canberra Australia
Athens Greece Hanoi Vietnam
Athens Greece Havana Cuba
Athens Greece Helsinki Finland
Athens Greece Islamabad Pakistan
Athens Greece Kabul Afghanistan
Athens Greece London England
Athens Greece Madrid Spain
Athens Greece Moscow Russia
Athens Greece Oslo Norway
Athens Greece Ottawa Canada
Athens Greece Paris France
Athens Greece Rome Italy
Athens Greece Stockholm Sweden
Athens Greece Tehran Iran
Athens Greece Tokyo Japan
Baghdad Iraq Bangkok Thailand
Baghdad Iraq Beijing China
Baghdad Iraq Berlin Germany
Baghdad Iraq Bern Switzerland
Baghdad Iraq Cairo Egypt
Baghdad Iraq Canberra Australia
Baghdad Iraq Hanoi Vietnam
Baghdad Iraq Havana Cuba
Baghdad Iraq Helsinki Finland
Baghdad Iraq Islamabad Pakistan
Baghdad Iraq Kabul Afghanistan
Baghdad Iraq London England
Baghdad Iraq Madrid Spain
Baghdad Iraq Moscow Russia
Baghdad Iraq Oslo Norway
Baghdad Iraq Ottawa Canada
Baghdad Iraq Paris France
Baghdad Iraq Rome Italy
Baghdad Iraq Stockholm Sweden
Baghdad Iraq Tehran Iran
Baghdad Iraq Tokyo Japan
Baghdad Iraq Athens Greece
Bangkok Thailand Beijing China
Bangkok Thailand Berlin Germany
Bangkok Thailand Bern Switzerland
Bangkok Thailand Cairo Egypt
Bangkok Thailand Canberra Australia
Bangkok Thailand Hanoi Vietnam
Bangkok Thailand Havana Cuba
Bangkok Thailand Helsinki Finland
Bangkok Thailand Islamabad Pakistan
Bangkok Thailand Kabul Afghanistan
Bangkok Thailand London England
Bangkok Thailand Madrid Spain
Bangkok Thailand Moscow Russia
Bangkok Thailand Oslo Norway
Bangkok Thailand Ottawa Canada
Bangkok Thailand Paris France
Bangkok Thailand Rome Italy
Bangkok Thailand Stockholm Sweden
Bangkok Thailand Tehran Iran
Bangkok Thailand Tokyo Japan
Bangkok Thailand Athens Greece
Bangkok Thailand Baghdad Iraq
Beijing China Berlin Germany
Beijing China Bern Switzerland
Beijing China Cairo Egypt
Beijing China Canberra Australia
Beijing China Hanoi Vietnam
Beijing China Havana Cuba
Beijing China Helsinki Finland
Beijing China Islamabad Pakistan
Beijing China Kabul Afghanistan
Beijing China London England
Beijing China Madrid Spain
Beijing China Moscow Russia
Beijing China Oslo Norway
Beijing China Ottawa Canada
Beijing China Paris France
Beijing China Rome Italy
Beijing China Stockholm Sweden
Beijing China Tehran Iran
Beijing China Tokyo Japan
Beijing China Athens Greece
Beijing China Baghdad Iraq
Beijing China Bangkok Thailand
Berlin Germany Bern Switzerland
Berlin Germany Cairo Egypt
Berlin Germany Canberra Australia
Berlin Germany Hanoi Vietnam
Berlin Germany Havana Cuba
Berlin Germany Helsinki Finland
Berlin Germany Islamabad Pakistan
Berlin Germany Kabul Afghanistan
Berlin Germany London England
Berlin Germany Madrid Spain
Berlin Germany Moscow Russia
Berlin Germany Oslo Norway
Berlin Germany Ottawa Canada
Berlin Germany Paris France
Berlin Germany Rome Italy
Berlin Germany Stockholm Sweden
Berlin Germany Tehran Iran
Berlin Germany Tokyo Japan
Berlin Germany Athens Greece
Berlin Germany Baghdad Iraq
Berlin Germany Bangkok Thailand
Berlin Germany Beijing China
Bern Switzerland Cairo Egypt
Bern Switzerland Canberra Australia
Bern Switzerland Hanoi Vietnam
Bern Switzerland Havana Cuba
Bern Switzerland Helsinki Finland
Bern Switzerland Islamabad Pakistan
Bern Switzerland Kabul Afghanistan
Bern Switzerland London England
Bern Switzerland Madrid Spain
Bern Switzerland Moscow Russia
Bern Switzerland Oslo Norway
Bern Switzerland Ottawa Canada
Bern Switzerland Paris France
Bern Switzerland Rome Italy
Bern Switzerland Stockholm Sweden
Bern Switzerland Tehran Iran
Bern Switzerland Tokyo Japan
Bern Switzerland Athens Greece
Bern Switzerland Baghdad Iraq
Bern Switzerland Bangkok Thailand
Bern Switzerland Beijing China
Bern Switzerland Berlin Germany
Cairo Egypt Canberra Australia
Cairo Egypt Hanoi Vietnam
Cairo Egypt Havana Cuba
Cairo Egypt Helsinki Finland
Cairo Egypt Islamabad Pakistan
Cairo Egypt Kabul Afghanistan
Cairo Egypt London England
Cairo Egypt Madrid Spain
Cairo Egypt Moscow Russia
Cairo Egypt Oslo Norway
Cairo Egypt Ottawa Canada
Cairo Egypt Paris France
Cairo Egypt Rome Italy
Cairo Egypt Stockholm Sweden
Cairo Egypt Tehran Iran
Cairo Egypt Tokyo Japan
Cairo Egypt Athens Greece
Cairo Egypt Baghdad Iraq
Cairo Egypt Bangkok Thailand
Cairo Egypt Beijing China
Cairo Egypt Berlin Germany
Cairo Egypt Bern Switzerland
Canberra Australia Hanoi Vietnam
Canberra Australia Havana Cuba
Canberra Australia Helsinki Finland
Canberra Australia Islamabad Pakistan
Canberra Australia Kabul Afghanistan
Canberra Australia London England
Canberra Australia Madrid Spain
Canberra Australia Moscow Russia
Canberra Australia Oslo Norway
Canberra Australia Ottawa Canada
Canberra Australia Paris France
Canberra Australia Rome Italy
Canberra Australia Stockholm Sweden
Canberra Australia Tehran Iran
Canberra Australia Tokyo Japan
Canberra Australia Athens Greece
Canberra Australia Baghdad Iraq
Canberra Australia Bangkok Thailand
Canberra Australia Beijing China
Canberra Australia Berlin Germany
Canberra Australia Bern Switzerland
Canberra Australia Cairo Egypt
Hanoi Vietnam Havana Cuba
Hanoi Vietnam Helsinki Finland
Hanoi Vietnam Islamabad Pakistan
Hanoi Vietnam Kabul Afghanistan
Hanoi Vietnam London England
Hanoi Vietnam Madrid Spain
Hanoi Vietnam Moscow Russia
Hanoi Vietnam Oslo Norway
Hanoi Vietnam Ottawa Canada
Hanoi Vietnam Paris France
Hanoi Vietnam Rome Italy
Hanoi Vietnam Stockholm Sweden
Hanoi Vietnam Tehran Iran
Hanoi Vietnam Tokyo Japan
Hanoi Vietnam Athens Greece
Hanoi Vietnam Baghdad Iraq
Hanoi Vietnam Bangkok Thailand
Hanoi Vietnam Beijing China
Hanoi Vietnam Berlin Germany
Hanoi Vietnam Bern Switzerland
Hanoi Vietnam Cairo Egypt
Hanoi Vietnam Canberra Australia
Havana Cuba Helsinki Finland
Havana Cuba Islamabad Pakistan
183648 183648 183648 183648 65918 183648 94834 93002 71149 183648 89518 183648 68731 183648 183648 63766 183648 183648 63766 183648 183648 63690 63766 83291 183648 183648 63766 183648 183648 183648 63766 65918 183648 73068 71149 183648 183648 183648 65370 183648 183648 67665 63945 93281 71149 183648 139630 183648 183648 63766 183648 183648 183648 183648 65062 110843 175871 94491 183648 183648 183648 183648 89277 183648 78014 183648 183648 63766 69529 183648 183648 183648 102013 63766 139449 183648 183648 113280 87363 64479 183648 183648 183648 183648 183648 183648 63766 183648 93281 183648 64068 183648 63690 183648 183648 183648 183648 71353 64068 183648 183648 183648 102334 97824 139630 183648 110843 183648 183648 183648 183648 183648 93281 183648 63766 183648 183648 183648 183648 183648 63766 67946 63766 69529 183648 183648 74700 183648 64292 183648 97584 64890 112995 125717 183648 183648 183648 65927 183648 183648 183648 64366 183648 183648 183648 68450 183648 102334 64068 183648 183648 183648 183648 183648 63690 183648 183648 183648 183648 183648 183648 183648 129534 92829 183648 183648 183648 183648 183648 183648 66719 63690 78014 183648 97719 183648 183648 74700 183648 183648 183648 102334 183648 183648 183648 183648 110843 80622 183648 183648 183648 102334 118096 183648 183648 183648 183648 208852 67073 183648 183648 125996 95715 66971 183648 183648 183648 97660 65663 98873 183648 183648 70581 183648 183648 183648 71353 183648 110843 183648 95715 66971 183648 67073 183648 97660 183648 183648 95715 183648 63690 183648 183648 63690 183648 99530 93281 183648 183648 183648 70654 183648 68437 183648 183648 183648 183648 74700 183648 183648 183648 63690 183648 183648 183648 93281 183648 183648 106973 75457 183648 63690 183648 183648 183648 183648 183648 183648 183648 183648 67438 63690 66087 76115 183648 183648 183648 183648 183648 183648 183648 63766 63766 183648 183648 63690 183648 80719 183648 183648 183648 183648 102013 183648 89431 183648 71932 183648 125996 183648 99901 213612 183648 183648 183648 183648 183648 213612 183648 63766 183648 183648 63766 67946 65113 183648 63690 183648 63690 183648 63766 183648 183648 63766 75094 67438 183648 63766 183648 63766 183648 183648 63856 79145 183648 183648 183648 64068 183648 183648 183648 183648 183648 183648 183648 183648 183648 183648 183648 183648 64649 183648 183648 183648 183648 183648 93281 63766 183648 183648 63689 67438 105276 118096 82908 93281 63766 72124 63702 68692 183648 183648 183648 88510
#! /bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# download train_data
mkdir raw_data
wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/1-billion-word-language-modeling-benchmark-r13output.tar
tar xvf 1-billion-word-language-modeling-benchmark-r13output.tar
mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/ raw_data/
# preprocess data
python preprocess.py --build_dict --build_dict_corpus_dir raw_data/training-monolingual.tokenized.shuffled --dict_path raw_data/word_count_dict.txt --ngrams_path raw_data/word_ngrams.txt
python preprocess.py --filter_corpus --dict_path raw_data/word_count_dict.txt --word_id_path raw_data/word_id_dict.txt --input_corpus_dir raw_data/training-monolingual.tokenized.shuffled --output_corpus_dir raw_data/convert_text8 --ngrams_id_path raw_data/word_ngrams_id.txt --ngrams_path raw_data/word_ngrams.txt --min_count 5 --downsample 0.001
mv raw_data/word_count_dict.txt data/dict/
mv raw_data/word_id_dict.txt data/dict/
mv raw_data/word_ngrams.txt data/dict/
mv raw_data/word_ngrams_id.txt data/dict/
rm -rf data/train/*
rm -rf data/test/*
python preprocess.py --data_resplit --file_nums 24 --input_corpus_dir=raw_data/convert_text8 --output_corpus_dir=data/train
# download test data
wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar
tar xzvf test_dir.tar -C raw_data
mv raw_data/data/test_dir/* data/test/
rm -rf raw_data
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import six
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
dict_path = envs.get_global_env(
"dataset.dataset_infer.word_id_dict_path")
self.min_n = envs.get_global_env("hyper_parameters.min_n")
self.max_n = envs.get_global_env("hyper_parameters.max_n")
self.word_to_id = dict()
self.id_to_word = dict()
with io.open(dict_path, 'r', encoding='utf-8') as f:
for line in f:
self.word_to_id[line.split(' ')[0]] = int(line.split(' ')[1])
self.id_to_word[int(line.split(' ')[1])] = line.split(' ')[0]
self.dict_size = len(self.word_to_id)
def computeSubwords(self, word):
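        # e.g. with min_n=3 and max_n=5, computeSubwords("<the>") yields the
        # character n-grams {"<th", "<the", "<the>", "the", "the>", "he>"} (as a list, order unspecified)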
ngrams = set()
for i in range(len(word) - self.min_n + 1):
for j in range(self.min_n, self.max_n + 1):
end = min(len(word), i + j)
ngrams.add("".join(word[i:end]))
return list(ngrams)
def native_to_unicode(self, s):
if self._is_unicode(s):
return s
try:
return self._to_unicode(s)
except UnicodeDecodeError:
res = self._to_unicode(s, ignore_errors=True)
return res
def _is_unicode(self, s):
if six.PY2:
if isinstance(s, unicode):
return True
else:
if isinstance(s, str):
return True
return False
def _to_unicode(self, s, ignore_errors=False):
if self._is_unicode(s):
return s
error_mode = "ignore" if ignore_errors else "strict"
return s.decode("utf-8", errors=error_mode)
def strip_lines(self, line, vocab):
return self._replace_oov(vocab, self.native_to_unicode(line))
def _replace_oov(self, original_vocab, line):
"""Replace out-of-vocab words with "<UNK>".
This maintains compatibility with published results.
Args:
original_vocab: a set of strings (The standard vocabulary for the dataset)
line: a unicode string - a space-delimited sequence of words.
Returns:
a unicode string - a space-delimited sequence of words.
"""
return u" ".join([
"<" + word + ">"
if "<" + word + ">" in original_vocab else u"<UNK>"
for word in line.split()
])
def generate_sample(self, line):
def reader():
if ':' in line:
pass
features = self.strip_lines(line.lower(), self.word_to_id)
features = features.split()
inputs = []
for item in features:
if item == "<UNK>":
inputs.append([self.word_to_id[item]])
else:
ngrams = self.computeSubwords(item)
res = []
res.append(self.word_to_id[item])
for _ in ngrams:
res.append(self.word_to_id[_])
inputs.append(res)
yield [('analogy_a', inputs[0]), ('analogy_b', inputs[1]),
('analogy_c', inputs[2]), ('analogy_d', inputs[3][0:1])]
return reader
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.fluid as fluid
from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
def _init_hyper_parameters(self):
self.is_distributed = True if envs.get_trainer(
) == "CtrTrainer" else False
self.sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number")
self.sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
self.with_shuffle_batch = envs.get_global_env(
"hyper_parameters.with_shuffle_batch")
self.learning_rate = envs.get_global_env(
"hyper_parameters.optimizer.learning_rate")
self.decay_steps = envs.get_global_env(
"hyper_parameters.optimizer.decay_steps")
self.decay_rate = envs.get_global_env(
"hyper_parameters.optimizer.decay_rate")
def input_data(self, is_infer=False, **kwargs):
if is_infer:
analogy_a = fluid.data(
name="analogy_a", shape=[None, 1], lod_level=1, dtype='int64')
analogy_b = fluid.data(
name="analogy_b", shape=[None, 1], lod_level=1, dtype='int64')
analogy_c = fluid.data(
name="analogy_c", shape=[None, 1], lod_level=1, dtype='int64')
analogy_d = fluid.data(
name="analogy_d", shape=[None, 1], dtype='int64')
return [analogy_a, analogy_b, analogy_c, analogy_d]
input_word = fluid.data(
name="input_word", shape=[None, 1], lod_level=1, dtype='int64')
true_word = fluid.data(
name='true_label', shape=[None, 1], lod_level=1, dtype='int64')
if self.with_shuffle_batch:
return [input_word, true_word]
neg_word = fluid.data(
name="neg_label", shape=[None, self.neg_num], dtype='int64')
return [input_word, true_word, neg_word]
def net(self, inputs, is_infer=False):
if is_infer:
self.infer_net(inputs)
return
def embedding_layer(input,
table_name,
initializer_instance=None,
sequence_pool=False):
emb = fluid.embedding(
input=input,
is_sparse=True,
is_distributed=self.is_distributed,
size=[self.sparse_feature_number, self.sparse_feature_dim],
param_attr=fluid.ParamAttr(
name=table_name, initializer=initializer_instance), )
if sequence_pool:
emb = fluid.layers.sequence_pool(
input=emb, pool_type='average')
return emb
init_width = 1.0 / self.sparse_feature_dim
emb_initializer = fluid.initializer.Uniform(-init_width, init_width)
emb_w_initializer = fluid.initializer.Constant(value=0.0)
input_emb = embedding_layer(inputs[0], "emb", emb_initializer, True)
input_emb = fluid.layers.squeeze(input=input_emb, axes=[1])
true_emb_w = embedding_layer(inputs[1], "emb_w", emb_w_initializer,
True)
true_emb_w = fluid.layers.squeeze(input=true_emb_w, axes=[1])
if self.with_shuffle_batch:
neg_emb_w_list = []
for i in range(self.neg_num):
neg_emb_w_list.append(
fluid.contrib.layers.shuffle_batch(
true_emb_w)) # shuffle true_word
neg_emb_w_concat = fluid.layers.concat(neg_emb_w_list, axis=0)
neg_emb_w = fluid.layers.reshape(
neg_emb_w_concat,
shape=[-1, self.neg_num, self.sparse_feature_dim])
else:
neg_emb_w = embedding_layer(inputs[2], "emb_w", emb_w_initializer)
true_logits = fluid.layers.reduce_sum(
fluid.layers.elementwise_mul(input_emb, true_emb_w),
dim=1,
keep_dim=True)
input_emb_re = fluid.layers.reshape(
input_emb, shape=[-1, 1, self.sparse_feature_dim])
neg_matmul = fluid.layers.matmul(
input_emb_re, neg_emb_w, transpose_y=True)
neg_logits = fluid.layers.reshape(neg_matmul, shape=[-1, 1])
logits = fluid.layers.concat([true_logits, neg_logits], axis=0)
label_ones = fluid.layers.fill_constant(
shape=[fluid.layers.shape(true_logits)[0], 1],
value=1.0,
dtype='float32')
label_zeros = fluid.layers.fill_constant(
shape=[fluid.layers.shape(neg_logits)[0], 1],
value=0.0,
dtype='float32')
label = fluid.layers.concat([label_ones, label_zeros], axis=0)
loss = fluid.layers.log_loss(fluid.layers.sigmoid(logits), label)
avg_cost = fluid.layers.reduce_sum(loss)
global_right_cnt = fluid.layers.create_global_var(
name="global_right_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_total_cnt = fluid.layers.create_global_var(
name="global_total_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_right_cnt.stop_gradient = True
global_total_cnt.stop_gradient = True
self._cost = avg_cost
self._metrics["LOSS"] = avg_cost
def optimizer(self):
optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=self.learning_rate,
decay_steps=self.decay_steps,
decay_rate=self.decay_rate,
staircase=True))
return optimizer
def infer_net(self, inputs):
def embedding_layer(input,
table_name,
initializer_instance=None,
sequence_pool=False):
emb = fluid.embedding(
input=input,
size=[self.sparse_feature_number, self.sparse_feature_dim],
param_attr=table_name)
if sequence_pool:
emb = fluid.layers.sequence_pool(
input=emb, pool_type='average')
return emb
all_label = np.arange(self.sparse_feature_number).reshape(
self.sparse_feature_number).astype('int32')
self.all_label = fluid.layers.cast(
x=fluid.layers.assign(all_label), dtype='int64')
emb_all_label = embedding_layer(self.all_label, "emb")
emb_a = embedding_layer(inputs[0], "emb", sequence_pool=True)
emb_b = embedding_layer(inputs[1], "emb", sequence_pool=True)
emb_c = embedding_layer(inputs[2], "emb", sequence_pool=True)
target = fluid.layers.elementwise_add(
fluid.layers.elementwise_sub(emb_b, emb_a), emb_c)
emb_all_label_l2 = fluid.layers.l2_normalize(x=emb_all_label, axis=1)
dist = fluid.layers.matmul(
x=target, y=emb_all_label_l2, transpose_y=True)
values, pred_idx = fluid.layers.topk(input=dist, k=4)
label = fluid.layers.expand(inputs[3], expand_times=[1, 4])
label_ones = fluid.layers.fill_constant_batch_size_like(
label, shape=[-1, 1], value=1.0, dtype='float32')
right_cnt = fluid.layers.reduce_sum(input=fluid.layers.cast(
fluid.layers.equal(pred_idx, label), dtype='float32'))
total_cnt = fluid.layers.reduce_sum(label_ones)
global_right_cnt = fluid.layers.create_global_var(
name="global_right_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_total_cnt = fluid.layers.create_global_var(
name="global_total_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_right_cnt.stop_gradient = True
global_total_cnt.stop_gradient = True
tmp1 = fluid.layers.elementwise_add(right_cnt, global_right_cnt)
fluid.layers.assign(tmp1, global_right_cnt)
tmp2 = fluid.layers.elementwise_add(total_cnt, global_total_cnt)
fluid.layers.assign(tmp2, global_total_cnt)
acc = fluid.layers.elementwise_div(
global_right_cnt, global_total_cnt, name="total_acc")
self._infer_results['acc'] = acc
# -*- coding: utf-8 -*-
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import math
import os
import random
import re
import six
import argparse
prog = re.compile("[^a-z ]", flags=0)
def parse_args():
parser = argparse.ArgumentParser(
description="Paddle Fluid word2 vector preprocess")
parser.add_argument(
'--build_dict_corpus_dir', type=str, help="The dir of corpus")
parser.add_argument(
'--input_corpus_dir', type=str, help="The dir of input corpus")
parser.add_argument(
'--output_corpus_dir', type=str, help="The dir of output corpus")
parser.add_argument(
'--dict_path',
type=str,
default='./dict',
help="The path of dictionary ")
parser.add_argument(
'--word_id_path',
type=str,
default='./word_id',
help="The path of word_id ")
parser.add_argument(
'--ngrams_path',
type=str,
default='./word_ngrams',
help="The path of word_ngrams ")
parser.add_argument(
'--ngrams_id_path',
type=str,
default='./word_ngrams_id',
help="The path of word_ngrams_id ")
parser.add_argument(
'--min_count',
type=int,
default=5,
help="If the word count is less then min_count, it will be removed from dict"
)
parser.add_argument('--min_n', type=int, default=3, help="min_n of ngrams")
parser.add_argument('--max_n', type=int, default=5, help="max_n of ngrams")
parser.add_argument(
'--file_nums',
type=int,
default=1024,
help="re-split input corpus file nums")
parser.add_argument(
'--downsample',
type=float,
default=0.001,
help="filter word by downsample")
parser.add_argument(
'--filter_corpus',
action='store_true',
default=False,
help='Filter corpus')
parser.add_argument(
'--build_dict',
action='store_true',
default=False,
help='Build dict from corpus')
parser.add_argument(
'--data_resplit',
action='store_true',
default=False,
help='re-split input corpus files')
return parser.parse_args()
def text_strip(text):
# English Preprocess Rule
return prog.sub("", text.lower())
# Shameless copy from Tensorflow https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/text_encoder.py
# Unicode utility functions that work with Python 2 and 3
def native_to_unicode(s):
if _is_unicode(s):
return s
try:
return _to_unicode(s)
except UnicodeDecodeError:
res = _to_unicode(s, ignore_errors=True)
return res
def _is_unicode(s):
if six.PY2:
if isinstance(s, unicode):
return True
else:
if isinstance(s, str):
return True
return False
def _to_unicode(s, ignore_errors=False):
if _is_unicode(s):
return s
error_mode = "ignore" if ignore_errors else "strict"
return s.decode("utf-8", errors=error_mode)
def filter_corpus(args):
"""
filter corpus and convert id.
"""
word_count = dict()
word_to_id_ = dict()
word_all_count = 0
id_counts = []
word_id = 0
# read dict
with io.open(args.dict_path, 'r', encoding='utf-8') as f:
for line in f:
word, count = line.split()[0], int(line.split()[1])
word_count[word] = count
word_to_id_[word] = word_id
word_id += 1
id_counts.append(count)
word_all_count += count
word_ngrams = dict()
with io.open(args.ngrams_path, 'r', encoding='utf-8') as f:
for line in f:
word, ngrams = line.rstrip().split(':')
ngrams = ngrams.split()
ngrams = [str(word_to_id_[_]) for _ in ngrams]
word_ngrams[word_to_id_[word]] = ' '.join(ngrams)
with io.open(args.ngrams_id_path, 'w+', encoding='utf-8') as fid:
for k, v in word_ngrams.items():
fid.write(u'{} {}\n'.format(k, v))
# write word2id file
print("write word2id file to : " + args.dict_path + "_word_to_id_")
with io.open(args.word_id_path, 'w+', encoding='utf-8') as fid:
for k, v in word_to_id_.items():
fid.write(k + " " + str(v) + '\n')
# filter corpus and convert id
if not os.path.exists(args.output_corpus_dir):
os.makedirs(args.output_corpus_dir)
for file in os.listdir(args.input_corpus_dir):
with io.open(args.output_corpus_dir + '/convert_' + file + '.csv',
"w") as wf:
with io.open(
args.input_corpus_dir + '/' + file,
encoding='utf-8') as rf:
print(args.input_corpus_dir + '/' + file)
for line in rf:
signal = False
line = text_strip(line)
words = line.split()
write_line = ""
for item in words:
if item in word_count:
idx = word_to_id_[item]
else:
idx = word_to_id_[native_to_unicode('<UNK>')]
count_w = id_counts[idx]
corpus_size = word_all_count
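                        # word2vec-style subsampling: the more frequent a word,
                        # the smaller keep_prob and the more often it is dropped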
keep_prob = (
math.sqrt(count_w /
(args.downsample * corpus_size)) + 1
) * (args.downsample * corpus_size) / count_w
r_value = random.random()
if r_value > keep_prob:
continue
write_line += str(idx)
write_line += " "
signal = True
if signal:
write_line = write_line[:-1] + "\n"
wf.write(_to_unicode(write_line))
def computeSubwords(word, min_n, max_n):
ngrams = set()
for i in range(len(word) - min_n + 1):
for j in range(min_n, max_n + 1):
end = min(len(word), i + j)
ngrams.add("".join(word[i:end]))
return list(ngrams)
def build_dict(args):
"""
    preprocess the data, generate dictionary and save into dict_path.
:param corpus_dir: the input data dir.
:param dict_path: the generated dict path. the data in dict is "word count"
:param min_count:
:return:
"""
# word to count
word_count = dict()
for file in os.listdir(args.build_dict_corpus_dir):
with io.open(
args.build_dict_corpus_dir + "/" + file,
encoding='utf-8') as f:
print("build dict : ", args.build_dict_corpus_dir + "/" + file)
for line in f:
line = text_strip(line)
words = line.split()
for item in words:
item = '<' + item + '>'
if item in word_count:
word_count[item] = word_count[item] + 1
else:
word_count[item] = 1
item_to_remove = []
for item in word_count:
if word_count[item] <= args.min_count:
item_to_remove.append(item)
unk_sum = 0
for item in item_to_remove:
unk_sum += word_count[item]
del word_count[item]
# sort by count
word_count[native_to_unicode('<UNK>')] = unk_sum
word_ngrams = dict()
ngrams_count = dict()
for item in word_count:
ngrams = computeSubwords(item, args.min_n, args.max_n)
word_ngrams[item] = ngrams
for sub_word in ngrams:
if sub_word not in ngrams_count:
ngrams_count[sub_word] = 1
else:
ngrams_count[sub_word] = ngrams_count[sub_word] + 1
ngrams_count = sorted(
ngrams_count.items(), key=lambda ngrams_count: -ngrams_count[1])
word_count = sorted(
word_count.items(), key=lambda word_count: -word_count[1])
with io.open(args.dict_path, 'w+', encoding='utf-8') as f:
for k, v in word_count:
f.write(k + " " + str(v) + '\n')
for k, v in ngrams_count:
f.write(k + " " + str(v) + '\n')
with io.open(args.ngrams_path, 'w+', encoding='utf-8') as f:
for key in word_ngrams:
f.write(key + ":")
f.write(" ".join(word_ngrams[key]))
f.write(u'\n')
def data_split(args):
raw_data_dir = args.input_corpus_dir
new_data_dir = args.output_corpus_dir
if not os.path.exists(new_data_dir):
os.mkdir(new_data_dir)
files = os.listdir(raw_data_dir)
print(files)
index = 0
contents = []
for file_ in files:
with open(os.path.join(raw_data_dir, file_), 'r') as f:
contents.extend(f.readlines())
num = int(args.file_nums)
    lines_per_file = len(contents) // num  # integer division: slice indices must be ints under Python 3
print("contents: ", str(len(contents)))
print("lines_per_file: ", str(lines_per_file))
for i in range(1, num + 1):
with open(os.path.join(new_data_dir, "part_" + str(i)), 'w') as fout:
data = contents[(i - 1) * lines_per_file:min(i * lines_per_file,
len(contents))]
for line in data:
fout.write(line)
if __name__ == "__main__":
args = parse_args()
if args.build_dict:
build_dict(args)
elif args.filter_corpus:
filter_corpus(args)
elif args.data_resplit:
data_split(args)
else:
print(
"error command line, please choose --build_dict or --filter_corpus")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import numpy as np
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class NumpyRandomInt(object):
def __init__(self, a, b, buf_size=1000):
self.idx = 0
self.buffer = np.random.random_integers(a, b, buf_size)
self.a = a
self.b = b
def __call__(self):
if self.idx == len(self.buffer):
self.buffer = np.random.random_integers(self.a, self.b,
len(self.buffer))
self.idx = 0
result = self.buffer[self.idx]
self.idx += 1
return result
class TrainReader(Reader):
def init(self):
dict_path = envs.get_global_env(
"dataset.dataset_train.word_count_dict_path")
word_ngrams_path = envs.get_global_env(
"dataset.dataset_train.word_ngrams_path")
self.window_size = envs.get_global_env("hyper_parameters.window_size")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
self.with_shuffle_batch = envs.get_global_env(
"hyper_parameters.with_shuffle_batch")
self.random_generator = NumpyRandomInt(1, self.window_size + 1)
self.word_ngrams = dict()
with io.open(word_ngrams_path, 'r', encoding='utf-8') as f:
for line in f:
line = line.rstrip().split()
                self.word_ngrams[str(line[0])] = list(
                    map(int, line[1:]))  # materialize the iterator so the n-grams can be reused (map is lazy in Python 3)
self.cs = None
if not self.with_shuffle_batch:
id_counts = []
word_all_count = 0
with io.open(dict_path, 'r', encoding='utf-8') as f:
for line in f:
word, count = line.split()[0], int(line.split()[1])
id_counts.append(count)
word_all_count += count
id_frequencys = [
float(count) / word_all_count for count in id_counts
]
np_power = np.power(np.array(id_frequencys), 0.75)
id_frequencys_pow = np_power / np_power.sum()
self.cs = np.array(id_frequencys_pow).cumsum()
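            # cumulative distribution of unigram frequencies raised to the 3/4
            # power; negative samples are drawn from it via searchsorted in generate_sample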
def get_context_words(self, words, idx):
"""
Get the context word list of target word.
words: the words of the current line
idx: input word index
window_size: window size
"""
target_window = self.random_generator()
start_point = idx - target_window # if (idx - target_window) > 0 else 0
if start_point < 0:
start_point = 0
end_point = idx + target_window
targets = words[start_point:idx] + words[idx + 1:end_point + 1]
return targets
def generate_sample(self, line):
def reader():
word_ids = [w for w in line.split()]
for idx, target_id in enumerate(word_ids):
input_word = [int(target_id)]
if target_id in self.word_ngrams:
input_word += self.word_ngrams[target_id]
context_word_ids = self.get_context_words(word_ids, idx)
for context_id in context_word_ids:
output = [('input_word', input_word),
('true_label', [int(context_id)])]
if not self.with_shuffle_batch:
neg_array = self.cs.searchsorted(
np.random.sample(self.neg_num))
output += [('neg_label',
[int(str(i)) for i in neg_array])]
yield output
return reader
......@@ -11,46 +11,71 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
evaluate:
workspace: "paddlerec.models.recall.gnn"
reader:
batch_size: 50
class: "{workspace}/evaluate_reader.py"
test_data_path: "{workspace}/data/test"
train:
trainer:
# for cluster training
strategy: "async"
# workspace
workspace: "paddlerec.models.recall.gnn"
epochs: 2
workspace: "paddlerec.models.recall.gnn"
# list of dataset
dataset:
- name: dataset_train # name of dataset to distinguish different datasets
batch_size: 100
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/reader.py"
- name: dataset_infer # name
batch_size: 50
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/test"
data_converter: "{workspace}/evaluate_reader.py"
reader:
batch_size: 100
class: "{workspace}/reader.py"
train_data_path: "{workspace}/data/train"
dataset_class: "DataLoader"
# hyper parameters of user-defined network
hyper_parameters:
optimizer:
class: Adam
learning_rate: 0.001
decay_steps: 3
decay_rate: 0.1
l2: 0.00001
sparse_feature_number: 43098
sparse_feature_dim: 100
corpus_size: 719470
gnn_propogation_steps: 1
model:
models: "{workspace}/model.py"
hyper_parameters:
use_DataLoader: True
config_path: "{workspace}/data/config.txt"
sparse_feature_dim: 100
gnn_propogation_steps: 1
learning_rate: 0.001
l2: 0.00001
decay_steps: 3
decay_rate: 0.1
optimizer: adam
# select runner by name
mode: train_runner
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner:
- name: train_runner
class: single_train
# num of epochs
epochs: 2
# device to run training or infer
device: cpu
save_checkpoint_interval: 1 # save model interval of epochs
save_inference_interval: 1 # save inference
save_checkpoint_path: "increment" # save checkpoint path
save_inference_path: "inference" # save inference path
save_inference_feed_varnames: [] # feed vars of save inference
save_inference_fetch_varnames: [] # fetch vars of save inference
init_model_path: "" # load model path
fetch_period: 10
- name: infer_runner
class: single_infer
# num of epochs
epochs: 1
# device to run training or infer
device: cpu
fetch_period: 1
init_model_path: "increment/0" # load model path
save:
increment:
dirname: "increment"
epoch_interval: 1
save_last: True
inference:
dirname: "inference"
epoch_interval: 1
save_last: True
# runner will run all the phase in each epoch
phase:
- name: phase1
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_train # select dataset by name
thread_num: 1
#- name: phase2
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_infer # select dataset by name
# thread_num: 1
......@@ -17,7 +17,7 @@
set -e
echo "begin to download data"
cd raw_data && python download.py
cd data && python download.py
mkdir diginetica
python preprocess.py --dataset diginetica
......@@ -26,8 +26,10 @@ python convert_data.py --data_dir diginetica
cat diginetica/train.txt | wc -l >> diginetica/config.txt
mkdir train_data
mv diginetica/train.txt train_data
rm -rf train && mkdir train
mv diginetica/train.txt train
mkdir test_data
mv diginetica/test.txt test_data
rm -rf test && mkdir test
mv diginetica/test.txt test
mv diginetica/config.txt ./config.txt
......@@ -21,10 +21,10 @@ from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class EvaluateReader(Reader):
class TrainReader(Reader):
def init(self):
self.batch_size = envs.get_global_env("batch_size", None,
"evaluate.reader")
self.batch_size = envs.get_global_env(
"dataset.dataset_infer.batch_size")
self.input = []
self.length = None
......
......@@ -25,74 +25,65 @@ from paddlerec.core.model import Model as ModelBase
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
self.init_config()
def init_config(self):
self._fetch_interval = 1
self.items_num, self.ins_num = self.config_read(
envs.get_global_env("hyper_parameters.config_path", None,
self._namespace))
self.train_batch_size = envs.get_global_env("batch_size", None,
"train.reader")
self.evaluate_batch_size = envs.get_global_env("batch_size", None,
"evaluate.reader")
self.hidden_size = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
self.step = envs.get_global_env(
"hyper_parameters.gnn_propogation_steps", None, self._namespace)
def _init_hyper_parameters(self):
self.learning_rate = envs.get_global_env(
"hyper_parameters.optimizer.learning_rate")
self.decay_steps = envs.get_global_env(
"hyper_parameters.optimizer.decay_steps")
self.decay_rate = envs.get_global_env(
"hyper_parameters.optimizer.decay_rate")
self.l2 = envs.get_global_env("hyper_parameters.optimizer.l2")
self.dict_size = envs.get_global_env(
"hyper_parameters.sparse_feature_number")
self.corpus_size = envs.get_global_env("hyper_parameters.corpus_size")
def config_read(self, config_path=None):
if config_path is None:
raise ValueError(
"please set train.model.hyper_parameters.config_path at first")
with open(config_path, "r") as fin:
item_nums = int(fin.readline().strip())
ins_nums = int(fin.readline().strip())
return item_nums, ins_nums
self.train_batch_size = envs.get_global_env(
"dataset.dataset_train.batch_size")
self.evaluate_batch_size = envs.get_global_env(
"dataset.dataset_infer.batch_size")
def input(self, bs):
self.items = fluid.data(
self.hidden_size = envs.get_global_env(
"hyper_parameters.sparse_feature_dim")
self.step = envs.get_global_env(
"hyper_parameters.gnn_propogation_steps")
def input_data(self, is_infer=False, **kwargs):
if is_infer:
bs = self.evaluate_batch_size
else:
bs = self.train_batch_size
items = fluid.data(
name="items", shape=[bs, -1],
dtype="int64") # [batch_size, uniq_max]
self.seq_index = fluid.data(
seq_index = fluid.data(
name="seq_index", shape=[bs, -1, 2],
dtype="int32") # [batch_size, seq_max, 2]
self.last_index = fluid.data(
last_index = fluid.data(
name="last_index", shape=[bs, 2], dtype="int32") # [batch_size, 2]
self.adj_in = fluid.data(
adj_in = fluid.data(
name="adj_in", shape=[bs, -1, -1],
dtype="float32") # [batch_size, seq_max, seq_max]
self.adj_out = fluid.data(
adj_out = fluid.data(
name="adj_out", shape=[bs, -1, -1],
dtype="float32") # [batch_size, seq_max, seq_max]
self.mask = fluid.data(
mask = fluid.data(
name="mask", shape=[bs, -1, 1],
dtype="float32") # [batch_size, seq_max, 1]
self.label = fluid.data(
label = fluid.data(
name="label", shape=[bs, 1], dtype="int64") # [batch_size, 1]
res = [
self.items, self.seq_index, self.last_index, self.adj_in,
self.adj_out, self.mask, self.label
]
res = [items, seq_index, last_index, adj_in, adj_out, mask, label]
return res
def train_input(self):
res = self.input(self.train_batch_size)
self._data_var = res
use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader",
False, self._namespace)
def net(self, inputs, is_infer=False):
if is_infer:
bs = self.evaluate_batch_size
else:
bs = self.train_batch_size
if self._platform != "LINUX" or use_dataloader:
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var,
capacity=256,
use_double_buffer=False,
iterable=False)
def net(self, items_num, hidden_size, step, bs):
stdv = 1.0 / math.sqrt(hidden_size)
stdv = 1.0 / math.sqrt(self.hidden_size)
def embedding_layer(input,
table_name,
......@@ -100,22 +91,22 @@ class Model(ModelBase):
initializer_instance=None):
emb = fluid.embedding(
input=input,
size=[items_num, emb_dim],
size=[self.dict_size, emb_dim],
param_attr=fluid.ParamAttr(
name=table_name, initializer=initializer_instance), )
name=table_name, initializer=initializer_instance))
return emb
sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv)
items_emb = embedding_layer(self.items, "emb", hidden_size,
items_emb = embedding_layer(inputs[0], "emb", self.hidden_size,
sparse_initializer)
pre_state = items_emb
for i in range(step):
for i in range(self.step):
pre_state = layers.reshape(
x=pre_state, shape=[bs, -1, hidden_size])
x=pre_state, shape=[bs, -1, self.hidden_size])
state_in = layers.fc(
input=pre_state,
name="state_in",
size=hidden_size,
size=self.hidden_size,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
......@@ -127,7 +118,7 @@ class Model(ModelBase):
state_out = layers.fc(
input=pre_state,
name="state_out",
size=hidden_size,
size=self.hidden_size,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
......@@ -137,33 +128,34 @@ class Model(ModelBase):
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, uniq_max, h]
state_adj_in = layers.matmul(self.adj_in,
state_adj_in = layers.matmul(inputs[3],
state_in) # [batch_size, uniq_max, h]
state_adj_out = layers.matmul(
self.adj_out, state_out) # [batch_size, uniq_max, h]
inputs[4], state_out) # [batch_size, uniq_max, h]
gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)
gru_input = layers.reshape(
x=gru_input, shape=[-1, hidden_size * 2])
x=gru_input, shape=[-1, self.hidden_size * 2])
gru_fc = layers.fc(input=gru_input,
name="gru_fc",
size=3 * hidden_size,
size=3 * self.hidden_size,
bias_attr=False)
pre_state, _, _ = fluid.layers.gru_unit(
input=gru_fc,
hidden=layers.reshape(
x=pre_state, shape=[-1, hidden_size]),
size=3 * hidden_size)
x=pre_state, shape=[-1, self.hidden_size]),
size=3 * self.hidden_size)
final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size])
seq = layers.gather_nd(final_state, self.seq_index)
last = layers.gather_nd(final_state, self.last_index)
final_state = layers.reshape(
pre_state, shape=[bs, -1, self.hidden_size])
seq = layers.gather_nd(final_state, inputs[1])
last = layers.gather_nd(final_state, inputs[2])
seq_fc = layers.fc(
input=seq,
name="seq_fc",
size=hidden_size,
size=self.hidden_size,
bias_attr=False,
act=None,
num_flatten_dims=2,
......@@ -171,7 +163,7 @@ class Model(ModelBase):
low=-stdv, high=stdv))) # [batch_size, seq_max, h]
last_fc = layers.fc(input=last,
name="last_fc",
size=hidden_size,
size=self.hidden_size,
bias_attr=False,
act=None,
num_flatten_dims=1,
......@@ -184,7 +176,7 @@ class Model(ModelBase):
add = layers.elementwise_add(seq_fc_t,
last_fc) # [seq_max, batch_size, h]
b = layers.create_parameter(
shape=[hidden_size],
shape=[self.hidden_size],
dtype='float32',
default_initializer=fluid.initializer.Constant(value=0.0)) # [h]
add = layers.elementwise_add(add, b) # [seq_max, batch_size, h]
......@@ -202,7 +194,7 @@ class Model(ModelBase):
bias_attr=False,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, seq_max, 1]
weight *= self.mask
weight *= inputs[5]
weight_mask = layers.elementwise_mul(
seq, weight, axis=0) # [batch_size, seq_max, h]
global_attention = layers.reduce_sum(
......@@ -213,7 +205,7 @@ class Model(ModelBase):
final_attention_fc = layers.fc(
input=final_attention,
name="final_attention_fc",
size=hidden_size,
size=self.hidden_size,
bias_attr=False,
act=None,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
......@@ -225,7 +217,7 @@ class Model(ModelBase):
# dtype="int64",
# persistable=True,
# name="all_vocab")
all_vocab = np.arange(1, items_num).reshape((-1)).astype('int32')
all_vocab = np.arange(1, self.dict_size).reshape((-1)).astype('int32')
all_vocab = fluid.layers.cast(
x=fluid.layers.assign(all_vocab), dtype='int64')
......@@ -235,63 +227,32 @@ class Model(ModelBase):
name="emb",
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv)),
size=[items_num, hidden_size]) # [all_vocab, h]
size=[self.dict_size, self.hidden_size]) # [all_vocab, h]
logits = layers.matmul(
x=final_attention_fc, y=all_emb,
transpose_y=True) # [batch_size, all_vocab]
softmax = layers.softmax_with_cross_entropy(
logits=logits, label=self.label) # [batch_size, 1]
logits=logits, label=inputs[6]) # [batch_size, 1]
self.loss = layers.reduce_mean(softmax) # [1]
self.acc = layers.accuracy(input=logits, label=self.label, k=20)
self.acc = layers.accuracy(input=logits, label=inputs[6], k=20)
def avg_loss(self):
self._cost = self.loss
if is_infer:
self._infer_results['acc'] = self.acc
self._infer_results['loss'] = self.loss
return
def metrics(self):
self._metrics["LOSS"] = self.loss
self._metrics["train_acc"] = self.acc
def train_net(self):
self.train_input()
self.net(self.items_num, self.hidden_size, self.step,
self.train_batch_size)
self.avg_loss()
self.metrics()
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
step_per_epoch = self.ins_num // self.train_batch_size
decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None,
self._namespace)
decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None,
self._namespace)
l2 = envs.get_global_env("hyper_parameters.l2", None, self._namespace)
step_per_epoch = self.corpus_size // self.train_batch_size
optimizer = fluid.optimizer.Adam(
learning_rate=fluid.layers.exponential_decay(
learning_rate=learning_rate,
decay_steps=decay_steps * step_per_epoch,
decay_rate=decay_rate),
learning_rate=self.learning_rate,
decay_steps=self.decay_steps * step_per_epoch,
decay_rate=self.decay_rate),
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=l2))
regularization_coeff=self.l2))
return optimizer
def infer_input(self):
self._reader_namespace = "evaluate.reader"
res = self.input(self.evaluate_batch_size)
self._infer_data_var = res
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self):
self.infer_input()
self.net(self.items_num, self.hidden_size, self.step,
self.evaluate_batch_size)
self._infer_results['acc'] = self.acc
self._infer_results['loss'] = self.loss
......@@ -23,9 +23,8 @@ from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
self.batch_size = envs.get_global_env("batch_size", None,
"train.reader")
self.batch_size = envs.get_global_env(
"dataset.dataset_train.batch_size")
self.input = []
self.length = None
......
......@@ -57,8 +57,8 @@
<img align="center" src="../../doc/imgs/gnn.png">
<p>
## Tutorial
### Training & Inference
## Tutorial (Quick Start)
###
```shell
python -m paddlerec.run -m paddlerec.models.recall.word2vec # word2vec
python -m paddlerec.run -m paddlerec.models.recall.ssr # ssr
......@@ -67,6 +67,40 @@ python -m paddlerec.run -m paddlerec.models.recall.gnn # gnn
python -m paddlerec.run -m paddlerec.models.recall.ncf # ncf
python -m paddlerec.run -m paddlerec.models.recall.youtube_dnn # youtube_dnn
```
## Tutorial (Reproducing the Paper Results)
To make it easy to run every model end to end, we ship sample data with each model and tune hyperparameters such as batch_size so that the training & test logs read well on the sample data. To reproduce the results reported in this README, adjust batch_size and the other hyperparameters according to the table below (a short config sketch after the table shows where these keys live), and use the provided scripts to download and preprocess the corresponding dataset.
| Model | batch_size | thread_num | epoch_num |
| :---: | :---: | :---: | :---: |
| Word2Vec | 100 | 5 | 5 |
| GNN | 100 | 1 | 30 |
| GRU4REC | 500 | 1 | 10 |
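The values in the table map onto different sections of each model's `config.yaml`. Below is a minimal sketch, not a complete config, using the GNN row as an example; the key names follow the config format shown earlier on this page and everything else is omitted:

```yaml
dataset:
  - name: dataset_train
    batch_size: 100        # batch_size from the table
    type: DataLoader
runner:
  - name: train_runner
    class: single_train
    epochs: 30             # epoch_num from the table
phase:
  - name: phase1
    model: "{workspace}/model.py"
    dataset_name: dataset_train
    thread_num: 1          # thread_num from the table
```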
### Data Preparation
See the data download & preprocessing script under each model directory.
```bash
sh data_prepare.sh
```
### Training
```bash
cd models/recall/gnn # enter the directory of the chosen recall model, taking gnn as an example
python -m paddlerec.run -m ./config.yaml # after customizing hyperparameters, point to the config file to run with your own settings
```
### Inference
```
# In the model's config.yaml, set workspace to the absolute path of the current directory
# In the model's config.yaml, set mode to infer_runner
# example: mode: train_runner -> mode: infer_runner
# in infer_runner, set class: single_infer
# switch the phase section to the infer configuration (see the comments in config.yaml)
# after editing config.yaml, run:
python -m paddlerec.run -m ./config.yaml # taking gnn as an example
```
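For reference, a minimal sketch of the entries touched by this switch is shown below; it assumes the default checkpoint path `increment/0` produced by the training run, and only the relevant keys are listed:

```yaml
mode: infer_runner
runner:
  - name: infer_runner
    class: single_infer
    device: cpu
    epochs: 1
    init_model_path: "increment/0"   # checkpoint saved by the training run
phase:
  - name: phase1
    model: "{workspace}/model.py"
    dataset_name: dataset_infer      # point the phase at the infer dataset
    thread_num: 1
```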
## Results Comparison
### Model Results
......
......@@ -11,51 +11,70 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
evaluate:
workspace: "paddlerec.models.recall.word2vec"
workspace: "paddlerec.models.recall.word2vec"
evaluate_only: False
evaluate_model_path: ""
reader:
batch_size: 50
class: "{workspace}/w2v_evaluate_reader.py"
test_data_path: "{workspace}/data/test"
word_id_dict_path: "{workspace}/data/dict/word_id_dict.txt"
# list of dataset
dataset:
- name: dataset_train # name of dataset to distinguish different datasets
batch_size: 100
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/train"
word_count_dict_path: "{workspace}/data/dict/word_count_dict.txt"
data_converter: "{workspace}/w2v_reader.py"
- name: dataset_infer # name
batch_size: 50
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/test"
word_id_dict_path: "{workspace}/data/dict/word_id_dict.txt"
data_converter: "{workspace}/w2v_evaluate_reader.py"
train:
trainer:
# for cluster training
strategy: "async"
hyper_parameters:
optimizer:
learning_rate: 1.0
decay_steps: 100000
decay_rate: 0.999
class: sgd
strategy: async
sparse_feature_number: 354051
sparse_feature_dim: 300
with_shuffle_batch: False
neg_num: 5
window_size: 5
# select runner by name
mode: train_runner
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner:
- name: train_runner
class: single_train
# num of epochs
epochs: 2
workspace: "paddlerec.models.recall.word2vec"
# device to run training or infer
device: cpu
save_checkpoint_interval: 1 # save model interval of epochs
save_inference_interval: 1 # save inference
save_checkpoint_path: "increment" # save checkpoint path
save_inference_path: "inference" # save inference path
save_inference_feed_varnames: [] # feed vars of save inference
save_inference_fetch_varnames: [] # fetch vars of save inference
init_model_path: "" # load model path
fetch_period: 10
- name: infer_runner
class: single_infer
# num of epochs
epochs: 1
# device to run training or infer
device: cpu
init_model_path: "increment/0" # load model path
reader:
batch_size: 100
class: "{workspace}/w2v_reader.py"
train_data_path: "{workspace}/data/train"
word_count_dict_path: "{workspace}/data/dict/word_count_dict.txt"
model:
models: "{workspace}/model.py"
hyper_parameters:
sparse_feature_number: 85
sparse_feature_dim: 300
with_shuffle_batch: False
neg_num: 5
window_size: 5
learning_rate: 1.0
decay_steps: 100000
decay_rate: 0.999
optimizer: sgd
save:
increment:
dirname: "increment"
epoch_interval: 1
save_last: True
inference:
dirname: "inference"
epoch_interval: 1
save_last: True
# runner will run all the phase in each epoch
phase:
- name: phase1
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_train # select dataset by name
thread_num: 1
#- name: phase2
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_infer # select dataset by name
# thread_num: 1
......@@ -22,16 +22,17 @@ tar xvf 1-billion-word-language-modeling-benchmark-r13output.tar
mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/ raw_data/
# preprocess data
python preprocess.py --build_dict --build_dict_corpus_dir raw_data/training-monolingual.tokenized.shuffled --dict_path raw_data/test_build_dict
python preprocess.py --filter_corpus --dict_path raw_data/test_build_dict --input_corpus_dir raw_data/training-monolingual.tokenized.shuffled --output_corpus_dir raw_data/convert_text8 --min_count 5 --downsample 0.001
mkdir thirdparty
mv raw_data/test_build_dict thirdparty/
mv raw_data/test_build_dict_word_to_id_ thirdparty/
python preprocess.py --build_dict --build_dict_corpus_dir raw_data/training-monolingual.tokenized.shuffled --dict_path raw_data/word_count_dict.txt
python preprocess.py --filter_corpus --dict_path raw_data/word_count_dict.txt --input_corpus_dir raw_data/training-monolingual.tokenized.shuffled --output_corpus_dir raw_data/convert_text8 --min_count 5 --downsample 0.001
mv raw_data/word_count_dict.txt data/dict/
mv raw_data/word_id_dict.txt data/dict/
python preprocess.py --data_resplit --input_corpus_dir=raw_data/convert_text8 --output_corpus_dir=train_data
rm -rf data/train/*
rm -rf data/test/*
python preprocess.py --data_resplit --input_corpus_dir=raw_data/convert_text8 --output_corpus_dir=data/train
# download test data
wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar
tar xzvf test_dir.tar -C raw_data
mv raw_data/data/test_dir test_data/
mv raw_data/data/test_dir/* data/test/
rm -rf raw_data
......@@ -23,45 +23,50 @@ class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
def input(self):
neg_num = int(
envs.get_global_env("hyper_parameters.neg_num", None,
self._namespace))
self.input_word = fluid.data(
def _init_hyper_parameters(self):
self.is_distributed = True if envs.get_trainer(
) == "CtrTrainer" else False
self.sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number")
self.sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
self.with_shuffle_batch = envs.get_global_env(
"hyper_parameters.with_shuffle_batch")
self.learning_rate = envs.get_global_env(
"hyper_parameters.optimizer.learning_rate")
self.decay_steps = envs.get_global_env(
"hyper_parameters.optimizer.decay_steps")
self.decay_rate = envs.get_global_env(
"hyper_parameters.optimizer.decay_rate")
def input_data(self, is_infer=False, **kwargs):
if is_infer:
analogy_a = fluid.data(
name="analogy_a", shape=[None], dtype='int64')
analogy_b = fluid.data(
name="analogy_b", shape=[None], dtype='int64')
analogy_c = fluid.data(
name="analogy_c", shape=[None], dtype='int64')
analogy_d = fluid.data(
name="analogy_d", shape=[None], dtype='int64')
return [analogy_a, analogy_b, analogy_c, analogy_d]
input_word = fluid.data(
name="input_word", shape=[None, 1], dtype='int64')
self.true_word = fluid.data(
true_word = fluid.data(
name='true_label', shape=[None, 1], dtype='int64')
self._data_var.append(self.input_word)
self._data_var.append(self.true_word)
with_shuffle_batch = bool(
int(
envs.get_global_env("hyper_parameters.with_shuffle_batch",
None, self._namespace)))
if not with_shuffle_batch:
self.neg_word = fluid.data(
name="neg_label", shape=[None, neg_num], dtype='int64')
self._data_var.append(self.neg_word)
if self.with_shuffle_batch:
return [input_word, true_word]
if self._platform != "LINUX":
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
neg_word = fluid.data(
name="neg_label", shape=[None, self.neg_num], dtype='int64')
return [input_word, true_word, neg_word]
def net(self):
is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
neg_num = int(
envs.get_global_env("hyper_parameters.neg_num", None,
self._namespace))
sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
with_shuffle_batch = bool(
int(
envs.get_global_env("hyper_parameters.with_shuffle_batch",
None, self._namespace)))
def net(self, inputs, is_infer=False):
if is_infer:
self.infer_net(inputs)
return
def embedding_layer(input,
table_name,
......@@ -71,8 +76,8 @@ class Model(ModelBase):
emb = fluid.embedding(
input=input,
is_sparse=True,
is_distributed=is_distributed,
size=[sparse_feature_number, emb_dim],
is_distributed=self.is_distributed,
size=[self.sparse_feature_number, emb_dim],
param_attr=fluid.ParamAttr(
name=table_name, initializer=initializer_instance), )
if squeeze:
......@@ -80,44 +85,44 @@ class Model(ModelBase):
else:
return emb
init_width = 0.5 / sparse_feature_dim
init_width = 0.5 / self.sparse_feature_dim
emb_initializer = fluid.initializer.Uniform(-init_width, init_width)
emb_w_initializer = fluid.initializer.Constant(value=0.0)
input_emb = embedding_layer(self.input_word, "emb", sparse_feature_dim,
input_emb = embedding_layer(inputs[0], "emb", self.sparse_feature_dim,
emb_initializer, True)
true_emb_w = embedding_layer(self.true_word, "emb_w",
sparse_feature_dim, emb_w_initializer,
True)
true_emb_b = embedding_layer(self.true_word, "emb_b", 1,
true_emb_w = embedding_layer(inputs[1], "emb_w",
self.sparse_feature_dim,
emb_w_initializer, True)
true_emb_b = embedding_layer(inputs[1], "emb_b", 1, emb_w_initializer,
True)
if with_shuffle_batch:
if self.with_shuffle_batch:
neg_emb_w_list = []
for i in range(neg_num):
for i in range(self.neg_num):
neg_emb_w_list.append(
fluid.contrib.layers.shuffle_batch(
true_emb_w)) # shuffle true_word
neg_emb_w_concat = fluid.layers.concat(neg_emb_w_list, axis=0)
neg_emb_w = fluid.layers.reshape(
neg_emb_w_concat, shape=[-1, neg_num, sparse_feature_dim])
neg_emb_w_concat,
shape=[-1, self.neg_num, self.sparse_feature_dim])
neg_emb_b_list = []
for i in range(neg_num):
for i in range(self.neg_num):
neg_emb_b_list.append(
fluid.contrib.layers.shuffle_batch(
true_emb_b)) # shuffle true_word
neg_emb_b = fluid.layers.concat(neg_emb_b_list, axis=0)
neg_emb_b_vec = fluid.layers.reshape(
neg_emb_b, shape=[-1, neg_num])
neg_emb_b, shape=[-1, self.neg_num])
else:
neg_emb_w = embedding_layer(self.neg_word, "emb_w",
sparse_feature_dim, emb_w_initializer)
neg_emb_b = embedding_layer(self.neg_word, "emb_b", 1,
neg_emb_w = embedding_layer(
inputs[2], "emb_w", self.sparse_feature_dim, emb_w_initializer)
neg_emb_b = embedding_layer(inputs[2], "emb_b", 1,
emb_w_initializer)
neg_emb_b_vec = fluid.layers.reshape(
neg_emb_b, shape=[-1, neg_num])
neg_emb_b, shape=[-1, self.neg_num])
true_logits = fluid.layers.elementwise_add(
fluid.layers.reduce_sum(
......@@ -127,18 +132,22 @@ class Model(ModelBase):
true_emb_b)
input_emb_re = fluid.layers.reshape(
input_emb, shape=[-1, 1, sparse_feature_dim])
input_emb, shape=[-1, 1, self.sparse_feature_dim])
neg_matmul = fluid.layers.matmul(
input_emb_re, neg_emb_w, transpose_y=True)
neg_logits = fluid.layers.elementwise_add(
fluid.layers.reshape(
neg_matmul, shape=[-1, neg_num]),
neg_emb_b_vec)
label_ones = fluid.layers.fill_constant_batch_size_like(
true_logits, shape=[-1, 1], value=1.0, dtype='float32')
label_zeros = fluid.layers.fill_constant_batch_size_like(
true_logits, shape=[-1, neg_num], value=0.0, dtype='float32')
neg_matmul_re = fluid.layers.reshape(
neg_matmul, shape=[-1, self.neg_num])
neg_logits = fluid.layers.elementwise_add(neg_matmul_re, neg_emb_b_vec)
#nce loss
label_ones = fluid.layers.fill_constant(
shape=[fluid.layers.shape(true_logits)[0], 1],
value=1.0,
dtype='float32')
label_zeros = fluid.layers.fill_constant(
shape=[fluid.layers.shape(true_logits)[0], self.neg_num],
value=0.0,
dtype='float32')
true_xent = fluid.layers.sigmoid_cross_entropy_with_logits(true_logits,
label_ones)
......@@ -149,7 +158,9 @@ class Model(ModelBase):
true_xent, dim=1),
fluid.layers.reduce_sum(
neg_xent, dim=1))
self.avg_cost = fluid.layers.reduce_mean(cost)
avg_cost = fluid.layers.reduce_mean(cost)
self._cost = avg_cost
global_right_cnt = fluid.layers.create_global_var(
name="global_right_cnt",
persistable=True,
......@@ -164,77 +175,33 @@ class Model(ModelBase):
value=0)
global_right_cnt.stop_gradient = True
global_total_cnt.stop_gradient = True
def avg_loss(self):
self._cost = self.avg_cost
def metrics(self):
self._metrics["LOSS"] = self.avg_cost
def train_net(self):
self.input()
self.net()
self.avg_loss()
self.metrics()
self._metrics["LOSS"] = avg_cost
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None,
self._namespace)
decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None,
self._namespace)
optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=learning_rate,
decay_steps=decay_steps,
decay_rate=decay_rate,
learning_rate=self.learning_rate,
decay_steps=self.decay_steps,
decay_rate=self.decay_rate,
staircase=True))
return optimizer
def analogy_input(self):
sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None, self._namespace)
self.analogy_a = fluid.data(
name="analogy_a", shape=[None], dtype='int64')
self.analogy_b = fluid.data(
name="analogy_b", shape=[None], dtype='int64')
self.analogy_c = fluid.data(
name="analogy_c", shape=[None], dtype='int64')
self.analogy_d = fluid.data(
name="analogy_d", shape=[None], dtype='int64')
self._infer_data_var = [
self.analogy_a, self.analogy_b, self.analogy_c, self.analogy_d
]
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self):
sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None, self._namespace)
def infer_net(self, inputs):
def embedding_layer(input, table_name, initializer_instance=None):
emb = fluid.embedding(
input=input,
size=[sparse_feature_number, sparse_feature_dim],
size=[self.sparse_feature_number, self.sparse_feature_dim],
param_attr=table_name)
return emb
self.analogy_input()
all_label = np.arange(sparse_feature_number).reshape(
sparse_feature_number).astype('int32')
all_label = np.arange(self.sparse_feature_number).reshape(
self.sparse_feature_number).astype('int32')
self.all_label = fluid.layers.cast(
x=fluid.layers.assign(all_label), dtype='int64')
emb_all_label = embedding_layer(self.all_label, "emb")
emb_a = embedding_layer(self.analogy_a, "emb")
emb_b = embedding_layer(self.analogy_b, "emb")
emb_c = embedding_layer(self.analogy_c, "emb")
emb_a = embedding_layer(inputs[0], "emb")
emb_b = embedding_layer(inputs[1], "emb")
emb_c = embedding_layer(inputs[2], "emb")
target = fluid.layers.elementwise_add(
fluid.layers.elementwise_sub(emb_b, emb_a), emb_c)
......@@ -245,8 +212,7 @@ class Model(ModelBase):
values, pred_idx = fluid.layers.topk(input=dist, k=4)
label = fluid.layers.expand(
fluid.layers.unsqueeze(
self.analogy_d, axes=[1]),
expand_times=[1, 4])
inputs[3], axes=[1]), expand_times=[1, 4])
label_ones = fluid.layers.fill_constant_batch_size_like(
label, shape=[-1, 1], value=1.0, dtype='float32')
right_cnt = fluid.layers.reduce_sum(input=fluid.layers.cast(
......
......@@ -162,7 +162,7 @@ def filter_corpus(args):
if r_value > keep_prob:
continue
write_line += str(idx)
write_line += ","
write_line += " "
signal = True
if signal:
write_line = write_line[:-1] + "\n"
......
......@@ -20,10 +20,10 @@ from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class EvaluateReader(Reader):
class TrainReader(Reader):
def init(self):
dict_path = envs.get_global_env("word_id_dict_path", None,
"evaluate.reader")
dict_path = envs.get_global_env(
"dataset.dataset_infer.word_id_dict_path")
self.word_to_id = dict()
self.id_to_word = dict()
with io.open(dict_path, 'r', encoding='utf-8') as f:
......@@ -75,6 +75,8 @@ class EvaluateReader(Reader):
def generate_sample(self, line):
def reader():
if ':' in line:
pass
features = self.strip_lines(line.lower(), self.word_to_id)
features = features.split()
yield [('analogy_a', [self.word_to_id[features[0]]]),
......
......@@ -40,14 +40,12 @@ class NumpyRandomInt(object):
class TrainReader(Reader):
def init(self):
dict_path = envs.get_global_env("word_count_dict_path", None,
"train.reader")
self.window_size = envs.get_global_env("hyper_parameters.window_size",
None, "train.model")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num", None,
"train.model")
dict_path = envs.get_global_env(
"dataset.dataset_train.word_count_dict_path")
self.window_size = envs.get_global_env("hyper_parameters.window_size")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
self.with_shuffle_batch = envs.get_global_env(
"hyper_parameters.with_shuffle_batch", None, "train.model")
"hyper_parameters.with_shuffle_batch")
self.random_generator = NumpyRandomInt(1, self.window_size + 1)
self.cs = None
......
......@@ -63,7 +63,7 @@ def build(dirname):
models_copy = [
'data/*.txt', 'data/*/*.txt', '*.yaml', '*.sh', 'tree/*.npy',
'tree/*.txt', 'data/sample_data/*', 'data/sample_data/train/*',
'data/sample_data/infer/*'
'data/sample_data/infer/*', 'data/*/*.csv'
]
engine_copy = ['*/*.sh']
......