diff --git a/core/model.py b/core/model.py
index 9422dc49ea9492ff958106a1a51ea11f318bd148..8472d9924a372d14c4c74fd680915d86db7eb324 100755
--- a/core/model.py
+++ b/core/model.py
@@ -149,11 +149,13 @@ class Model(object):
return optimizer_i
def optimizer(self):
- learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
- None, self._namespace)
- optimizer = envs.get_global_env("hyper_parameters.optimizer", None,
- self._namespace)
- return self._build_optimizer(optimizer, learning_rate)
+ opt_name = envs.get_global_env("hyper_parameters.optimizer.class")
+ opt_lr = envs.get_global_env(
+ "hyper_parameters.optimizer.learning_rate")
+ opt_strategy = envs.get_global_env(
+ "hyper_parameters.optimizer.strategy")
+
+ return self._build_optimizer(opt_name, opt_lr, opt_strategy)
def input_data(self, is_infer=False, **kwargs):
name = "dataset." + kwargs.get("dataset_name") + "."
diff --git a/core/trainers/single_infer.py b/core/trainers/single_infer.py
index 0f1c92f3f2d948c76d4cd2b0fdcca131b99cfc92..d54e418c2a36d96f94ec39e53fc11c19e43d3f06 100755
--- a/core/trainers/single_infer.py
+++ b/core/trainers/single_infer.py
@@ -167,6 +167,7 @@ class SingleInfer(TranspileTrainer):
model = envs.lazy_instance_by_fliename(
model_path, "Model")(self._env)
model._infer_data_var = model.input_data(
+ is_infer=True,
dataset_name=model_dict["dataset_name"])
if envs.get_global_env("dataset." + dataset_name +
".type") == "DataLoader":
diff --git a/core/trainers/single_trainer.py b/core/trainers/single_trainer.py
index 2f8fda74b765e2b26cf81d68af7bf07f17432e3f..21890857a007925c7e759a6165934b9ad838bcae 100755
--- a/core/trainers/single_trainer.py
+++ b/core/trainers/single_trainer.py
@@ -147,11 +147,6 @@ class SingleTrainer(TranspileTrainer):
startup_program = fluid.Program()
scope = fluid.Scope()
dataset_name = model_dict["dataset_name"]
- opt_name = envs.get_global_env("hyper_parameters.optimizer.class")
- opt_lr = envs.get_global_env(
- "hyper_parameters.optimizer.learning_rate")
- opt_strategy = envs.get_global_env(
- "hyper_parameters.optimizer.strategy")
with fluid.program_guard(train_program, startup_program):
with fluid.unique_name.guard():
with fluid.scope_guard(scope):
@@ -168,8 +163,7 @@ class SingleTrainer(TranspileTrainer):
self._get_dataloader(dataset_name,
model._data_loader)
model.net(model._data_var, False)
- optimizer = model._build_optimizer(opt_name, opt_lr,
- opt_strategy)
+ optimizer = model.optimizer()
optimizer.minimize(model._cost)
self._model[model_dict["name"]][0] = train_program
self._model[model_dict["name"]][1] = startup_program
diff --git a/models/match/dssm/config.yaml b/models/match/dssm/config.yaml
index 22881bdf906178a87f4b89091589523533eac934..536b0948296638a01a3d185e88e2be827b3ad063 100755
--- a/models/match/dssm/config.yaml
+++ b/models/match/dssm/config.yaml
@@ -11,44 +11,66 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-evaluate:
- reader:
- batch_size: 1
- class: "{workspace}/synthetic_evaluate_reader.py"
- test_data_path: "{workspace}/data/train"
-
-train:
- trainer:
- # for cluster training
- strategy: "async"
- epochs: 4
- workspace: "paddlerec.models.match.dssm"
- reader:
- batch_size: 4
- class: "{workspace}/synthetic_reader.py"
- train_data_path: "{workspace}/data/train"
+workspace: "paddlerec.models.match.dssm"
+
+dataset:
+- name: dataset_train
+ batch_size: 4
+ type: QueueDataset
+ data_path: "{workspace}/data/train"
+ data_converter: "{workspace}/synthetic_reader.py"
+- name: dataset_infer
+ batch_size: 1
+ type: QueueDataset
+ data_path: "{workspace}/data/train"
+ data_converter: "{workspace}/synthetic_evaluate_reader.py"
- model:
- models: "{workspace}/model.py"
- hyper_parameters:
- TRIGRAM_D: 1000
- NEG: 4
- fc_sizes: [300, 300, 128]
- fc_acts: ['tanh', 'tanh', 'tanh']
- learning_rate: 0.01
- optimizer: sgd
+hyper_parameters:
+ optimizer:
+ class: sgd
+ learning_rate: 0.01
+ strategy: async
+ trigram_d: 1000
+ neg_num: 4
+ fc_sizes: [300, 300, 128]
+ fc_acts: ['tanh', 'tanh', 'tanh']
- save:
- increment:
- dirname: "increment"
- epoch_interval: 2
- save_last: True
+mode: train_runner
+# config of each runner.
+# runner is a kind of paddle training class, which wraps the train/infer process.
+runner:
+- name: train_runner
+ class: single_train
+ # num of epochs
+ epochs: 4
+ # device to run training or infer
+ device: cpu
+ save_checkpoint_interval: 2 # save model interval of epochs
+ save_inference_interval: 4 # save inference interval of epochs
+ save_checkpoint_path: "increment" # save checkpoint path
+ save_inference_path: "inference" # save inference path
+ save_inference_feed_varnames: ["query", "doc_pos"] # feed vars of save inference
+ save_inference_fetch_varnames: ["cos_sim_0.tmp_0"] # fetch vars of save inference
+ init_model_path: "" # load model path
+ fetch_period: 2
+- name: infer_runner
+ class: single_infer
+ # num of epochs
+ epochs: 1
+ # device to run training or infer
+ device: cpu
+ fetch_period: 1
+ init_model_path: "increment/2" # load model path
- inference:
- dirname: "inference"
- epoch_interval: 4
- feed_varnames: ["query", "doc_pos"]
- fetch_varnames: ["cos_sim_0.tmp_0"]
- save_last: True
+# runner will run all the phases in each epoch
+phase:
+- name: phase1
+ model: "{workspace}/model.py" # user-defined model
+ dataset_name: dataset_train # select dataset by name
+ thread_num: 1
+#- name: phase2
+# model: "{workspace}/model.py" # user-defined model
+# dataset_name: dataset_infer # select dataset by name
+# thread_num: 1
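The `runner`/`phase` comments in the config above describe the new execution model: `mode` selects one runner, and that runner executes every uncommented phase in each epoch. A rough, self-contained sketch of how such a config is walked; the dict mirrors the YAML above, but the dispatch is a simplification of what paddlerec.core.trainers actually does (assumption, not the real code):

```python
# Simplified walk of the runner/phase config (illustrative only).
config = {
    "mode": "train_runner",
    "runner": [{"name": "train_runner", "class": "single_train", "epochs": 4},
               {"name": "infer_runner", "class": "single_infer", "epochs": 1}],
    "phase": [{"name": "phase1", "model": "{workspace}/model.py",
               "dataset_name": "dataset_train", "thread_num": 1}],
}

# `mode` picks the runner; the runner loops over epochs and phases
runner = next(r for r in config["runner"] if r["name"] == config["mode"])
for epoch in range(runner["epochs"]):
    for phase in config["phase"]:
        # each phase runs its model against the dataset it names
        print(epoch, runner["class"], phase["model"], phase["dataset_name"])
```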
diff --git a/models/match/dssm/model.py b/models/match/dssm/model.py
index 05d6f762cb266b4cbe40c9a972aafe1885af5b86..1b0520e18ed4f32ddc7c81b1b7b6287858995913 100755
--- a/models/match/dssm/model.py
+++ b/models/match/dssm/model.py
@@ -22,45 +22,39 @@ class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
- def input(self):
- TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None,
- self._namespace)
-
- Neg = envs.get_global_env("hyper_parameters.NEG", None,
- self._namespace)
-
- self.query = fluid.data(
- name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
- self.doc_pos = fluid.data(
+ def _init_hyper_parameters(self):
+ self.trigram_d = envs.get_global_env("hyper_parameters.trigram_d")
+ self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
+ self.hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes")
+ self.hidden_acts = envs.get_global_env("hyper_parameters.fc_acts")
+ self.learning_rate = envs.get_global_env(
+ "hyper_parameters.learning_rate")
+
+ def input_data(self, is_infer=False, **kwargs):
+ query = fluid.data(
+ name="query",
+ shape=[-1, self.trigram_d],
+ dtype='float32',
+ lod_level=0)
+ doc_pos = fluid.data(
name="doc_pos",
- shape=[-1, TRIGRAM_D],
+ shape=[-1, self.trigram_d],
dtype='float32',
lod_level=0)
- self.doc_negs = [
+
+ if is_infer:
+ return [query, doc_pos]
+
+ doc_negs = [
fluid.data(
name="doc_neg_" + str(i),
- shape=[-1, TRIGRAM_D],
+ shape=[-1, self.trigram_d],
dtype="float32",
- lod_level=0) for i in range(Neg)
+ lod_level=0) for i in range(self.neg_num)
]
- self._data_var.append(self.query)
- self._data_var.append(self.doc_pos)
- for input in self.doc_negs:
- self._data_var.append(input)
-
- if self._platform != "LINUX":
- self._data_loader = fluid.io.DataLoader.from_generator(
- feed_list=self._data_var,
- capacity=64,
- use_double_buffer=False,
- iterable=False)
-
- def net(self, is_infer=False):
- hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None,
- self._namespace)
- hidden_acts = envs.get_global_env("hyper_parameters.fc_acts", None,
- self._namespace)
+ return [query, doc_pos] + doc_negs
+ def net(self, inputs, is_infer=False):
def fc(data, hidden_layers, hidden_acts, names):
fc_inputs = [data]
for i in range(len(hidden_layers)):
@@ -77,71 +71,30 @@ class Model(ModelBase):
fc_inputs.append(out)
return fc_inputs[-1]
- query_fc = fc(self.query, hidden_layers, hidden_acts,
+ query_fc = fc(inputs[0], self.hidden_layers, self.hidden_acts,
['query_l1', 'query_l2', 'query_l3'])
- doc_pos_fc = fc(self.doc_pos, hidden_layers, hidden_acts,
+ doc_pos_fc = fc(inputs[1], self.hidden_layers, self.hidden_acts,
['doc_pos_l1', 'doc_pos_l2', 'doc_pos_l3'])
- self.R_Q_D_p = fluid.layers.cos_sim(query_fc, doc_pos_fc)
+ R_Q_D_p = fluid.layers.cos_sim(query_fc, doc_pos_fc)
if is_infer:
+ self._infer_results["query_doc_sim"] = R_Q_D_p
return
R_Q_D_ns = []
- for i, doc_neg in enumerate(self.doc_negs):
- doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts, [
- 'doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i),
- 'doc_neg_l3_' + str(i)
- ])
+ for i in range(len(inputs) - 2):
+ doc_neg_fc_i = fc(
+ inputs[i + 2], self.hidden_layers, self.hidden_acts, [
+ 'doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i),
+ 'doc_neg_l3_' + str(i)
+ ])
R_Q_D_ns.append(fluid.layers.cos_sim(query_fc, doc_neg_fc_i))
- concat_Rs = fluid.layers.concat(
- input=[self.R_Q_D_p] + R_Q_D_ns, axis=-1)
+ concat_Rs = fluid.layers.concat(input=[R_Q_D_p] + R_Q_D_ns, axis=-1)
prob = fluid.layers.softmax(concat_Rs, axis=1)
hit_prob = fluid.layers.slice(
prob, axes=[0, 1], starts=[0, 0], ends=[4, 1])
loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob))
- self.avg_cost = fluid.layers.mean(x=loss)
-
- def infer_results(self):
- self._infer_results['query_doc_sim'] = self.R_Q_D_p
-
- def avg_loss(self):
- self._cost = self.avg_cost
-
- def metrics(self):
- self._metrics["LOSS"] = self.avg_cost
-
- def train_net(self):
- self.input()
- self.net(is_infer=False)
- self.avg_loss()
- self.metrics()
-
- def optimizer(self):
- learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
- None, self._namespace)
- optimizer = fluid.optimizer.SGD(learning_rate)
- return optimizer
-
- def infer_input(self):
- TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None,
- self._namespace)
- self.query = fluid.data(
- name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
- self.doc_pos = fluid.data(
- name="doc_pos",
- shape=[-1, TRIGRAM_D],
- dtype='float32',
- lod_level=0)
- self._infer_data_var = [self.query, self.doc_pos]
-
- self._infer_data_loader = fluid.io.DataLoader.from_generator(
- feed_list=self._infer_data_var,
- capacity=64,
- use_double_buffer=False,
- iterable=False)
-
- def infer_net(self):
- self.infer_input()
- self.net(is_infer=True)
- self.infer_results()
+ avg_cost = fluid.layers.mean(x=loss)
+ self._cost = avg_cost
+ self._metrics["LOSS"] = avg_cost
diff --git a/models/match/dssm/synthetic_evaluate_reader.py b/models/match/dssm/synthetic_evaluate_reader.py
index 97f50abf9720060b008b90c7729e93d13701bb3b..5ee894fd379e87d3c7a7dcfa15c533230f3e2529 100755
--- a/models/match/dssm/synthetic_evaluate_reader.py
+++ b/models/match/dssm/synthetic_evaluate_reader.py
@@ -16,7 +16,7 @@ from __future__ import print_function
from paddlerec.core.reader import Reader
-class EvaluateReader(Reader):
+class TrainReader(Reader):
def init(self):
pass
diff --git a/models/match/multiview-simnet/config.yaml b/models/match/multiview-simnet/config.yaml
index 53ac4c095c0d347cca8cba1afb9866c66ab85218..276ddae156b36b27a46bd370bb79161575f0c558 100755
--- a/models/match/multiview-simnet/config.yaml
+++ b/models/match/multiview-simnet/config.yaml
@@ -11,49 +11,73 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-evaluate:
- workspace: "paddlerec.models.match.multiview-simnet"
- reader:
- batch_size: 2
- class: "{workspace}/evaluate_reader.py"
- test_data_path: "{workspace}/data/test"
-train:
- trainer:
- # for cluster training
- strategy: "async"
+# workspace
+workspace: "paddlerec.models.match.multiview-simnet"
- epochs: 2
- workspace: "paddlerec.models.match.multiview-simnet"
+# list of dataset
+dataset:
+- name: dataset_train # name of dataset to distinguish different datasets
+ batch_size: 2
+ type: DataLoader # or QueueDataset
+ data_path: "{workspace}/data/train"
+ sparse_slots: "1 2 3"
+- name: dataset_infer # name
+ batch_size: 2
+ type: DataLoader # or QueueDataset
+ data_path: "{workspace}/data/test"
+ sparse_slots: "1 2"
- reader:
- batch_size: 2
- class: "{workspace}/reader.py"
- train_data_path: "{workspace}/data/train"
- dataset_class: "DataLoader"
+# hyper parameters of user-defined network
+hyper_parameters:
+ optimizer:
+ class: Adam
+ learning_rate: 0.0001
+ strategy: async
+ query_encoder: "bow"
+ title_encoder: "bow"
+ query_encode_dim: 128
+ title_encode_dim: 128
+ sparse_feature_dim: 1000001
+ embedding_dim: 128
+ hidden_size: 128
+ margin: 0.1
- model:
- models: "{workspace}/model.py"
- hyper_parameters:
- use_DataLoader: True
- query_encoder: "bow"
- title_encoder: "bow"
- query_encode_dim: 128
- title_encode_dim: 128
- query_slots: 1
- title_slots: 1
- sparse_feature_dim: 1000001
- embedding_dim: 128
- hidden_size: 128
- learning_rate: 0.0001
- optimizer: adam
+# select runner by name
+mode: train_runner
+# config of each runner.
+# runner is a kind of paddle training class, which wraps the train/infer process.
+runner:
+- name: train_runner
+ class: single_train
+ # num of epochs
+ epochs: 2
+ # device to run training or infer
+ device: cpu
+ save_checkpoint_interval: 1 # save model interval of epochs
+ save_inference_interval: 1 # save inference interval of epochs
+ save_checkpoint_path: "increment" # save checkpoint path
+ save_inference_path: "inference" # save inference path
+ save_inference_feed_varnames: [] # feed vars of save inference
+ save_inference_fetch_varnames: [] # fetch vars of save inference
+ init_model_path: "" # load model path
+ fetch_period: 1
+- name: infer_runner
+ class: single_infer
+ # num of epochs
+ epochs: 1
+ # device to run training or infer
+ device: cpu
+ fetch_period: 1
+ init_model_path: "increment/0" # load model path
- save:
- increment:
- dirname: "increment"
- epoch_interval: 1
- save_last: True
- inference:
- dirname: "inference"
- epoch_interval: 1
- save_last: True
+# runner will run all the phases in each epoch
+phase:
+- name: phase1
+ model: "{workspace}/model.py" # user-defined model
+ dataset_name: dataset_train # select dataset by name
+ thread_num: 1
+#- name: phase2
+# model: "{workspace}/model.py" # user-defined model
+# dataset_name: dataset_infer # select dataset by name
+# thread_num: 1
diff --git a/models/match/multiview-simnet/model.py b/models/match/multiview-simnet/model.py
index f80a1cd0390f3c7aafc772ef535eb36b9657b439..608446f97281f3b9fd46fe272e307d7ae9335f13 100755
--- a/models/match/multiview-simnet/model.py
+++ b/models/match/multiview-simnet/model.py
@@ -99,143 +99,89 @@ class SimpleEncoderFactory(object):
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
- self.init_config()
-
- def init_config(self):
- self._fetch_interval = 1
- query_encoder = envs.get_global_env("hyper_parameters.query_encoder",
- None, self._namespace)
- title_encoder = envs.get_global_env("hyper_parameters.title_encoder",
- None, self._namespace)
- query_encode_dim = envs.get_global_env(
- "hyper_parameters.query_encode_dim", None, self._namespace)
- title_encode_dim = envs.get_global_env(
- "hyper_parameters.title_encode_dim", None, self._namespace)
- query_slots = envs.get_global_env("hyper_parameters.query_slots", None,
- self._namespace)
- title_slots = envs.get_global_env("hyper_parameters.title_slots", None,
- self._namespace)
- factory = SimpleEncoderFactory()
- self.query_encoders = [
- factory.create(query_encoder, query_encode_dim)
- for i in range(query_slots)
- ]
- self.title_encoders = [
- factory.create(title_encoder, title_encode_dim)
- for i in range(title_slots)
- ]
+
+ def _init_hyper_parameters(self):
+ self.query_encoder = envs.get_global_env(
+ "hyper_parameters.query_encoder")
+ self.title_encoder = envs.get_global_env(
+ "hyper_parameters.title_encoder")
+ self.query_encode_dim = envs.get_global_env(
+ "hyper_parameters.query_encode_dim")
+ self.title_encode_dim = envs.get_global_env(
+ "hyper_parameters.title_encode_dim")
self.emb_size = envs.get_global_env(
- "hyper_parameters.sparse_feature_dim", None, self._namespace)
- self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim",
- None, self._namespace)
+ "hyper_parameters.sparse_feature_dim")
+ self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim")
self.emb_shape = [self.emb_size, self.emb_dim]
- self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size",
- None, self._namespace)
- self.margin = 0.1
-
- def input(self, is_train=True):
- self.q_slots = [
- fluid.data(
- name="%d" % i, shape=[None, 1], lod_level=1, dtype='int64')
- for i in range(len(self.query_encoders))
- ]
- self.pt_slots = [
- fluid.data(
- name="%d" % (i + len(self.query_encoders)),
- shape=[None, 1],
- lod_level=1,
- dtype='int64') for i in range(len(self.title_encoders))
- ]
- if is_train == False:
- return self.q_slots + self.pt_slots
+ self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size")
+ self.margin = envs.get_global_env("hyper_parameters.margin")
- self.nt_slots = [
- fluid.data(
- name="%d" %
- (i + len(self.query_encoders) + len(self.title_encoders)),
- shape=[None, 1],
- lod_level=1,
- dtype='int64') for i in range(len(self.title_encoders))
+ def net(self, input, is_infer=False):
+ factory = SimpleEncoderFactory()
+ self.q_slots = self._sparse_data_var[0:1]
+ self.query_encoders = [
+ factory.create(self.query_encoder, self.query_encode_dim)
+ for _ in self.q_slots
]
-
- return self.q_slots + self.pt_slots + self.nt_slots
-
- def train_input(self):
- res = self.input()
- self._data_var = res
-
- use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader",
- False, self._namespace)
-
- if self._platform != "LINUX" or use_dataloader:
- self._data_loader = fluid.io.DataLoader.from_generator(
- feed_list=self._data_var,
- capacity=256,
- use_double_buffer=False,
- iterable=False)
-
- def get_acc(self, x, y):
- less = tensor.cast(cf.less_than(x, y), dtype='float32')
- label_ones = fluid.layers.fill_constant_batch_size_like(
- input=x, dtype='float32', shape=[-1, 1], value=1.0)
- correct = fluid.layers.reduce_sum(less)
- total = fluid.layers.reduce_sum(label_ones)
- acc = fluid.layers.elementwise_div(correct, total)
- return acc
-
- def net(self):
q_embs = [
fluid.embedding(
input=query, size=self.emb_shape, param_attr="emb")
for query in self.q_slots
]
- pt_embs = [
- fluid.embedding(
- input=title, size=self.emb_shape, param_attr="emb")
- for title in self.pt_slots
- ]
- nt_embs = [
- fluid.embedding(
- input=title, size=self.emb_shape, param_attr="emb")
- for title in self.nt_slots
- ]
-
# encode each embedding field with encoder
q_encodes = [
self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
]
- pt_encodes = [
- self.title_encoders[i].forward(emb)
- for i, emb in enumerate(pt_embs)
- ]
- nt_encodes = [
- self.title_encoders[i].forward(emb)
- for i, emb in enumerate(nt_embs)
- ]
-
# concat multi view for query, pos_title, neg_title
q_concat = fluid.layers.concat(q_encodes)
- pt_concat = fluid.layers.concat(pt_encodes)
- nt_concat = fluid.layers.concat(nt_encodes)
-
# projection of hidden layer
q_hid = fluid.layers.fc(q_concat,
size=self.hidden_size,
param_attr='q_fc.w',
bias_attr='q_fc.b')
+
+ self.pt_slots = self._sparse_data_var[1:2]
+ self.title_encoders = [
+ factory.create(self.title_encoder, self.title_encode_dim)
+ ]
+ pt_embs = [
+ fluid.embedding(
+ input=title, size=self.emb_shape, param_attr="emb")
+ for title in self.pt_slots
+ ]
+ pt_encodes = [
+ self.title_encoders[i].forward(emb)
+ for i, emb in enumerate(pt_embs)
+ ]
+ pt_concat = fluid.layers.concat(pt_encodes)
pt_hid = fluid.layers.fc(pt_concat,
size=self.hidden_size,
param_attr='t_fc.w',
bias_attr='t_fc.b')
+ # cosine of hidden layers
+ cos_pos = fluid.layers.cos_sim(q_hid, pt_hid)
+
+ if is_infer:
+ self._infer_results['query_pt_sim'] = cos_pos
+ return
+
+ self.nt_slots = self._sparse_data_var[2:3]
+ nt_embs = [
+ fluid.embedding(
+ input=title, size=self.emb_shape, param_attr="emb")
+ for title in self.nt_slots
+ ]
+ nt_encodes = [
+ self.title_encoders[i].forward(emb)
+ for i, emb in enumerate(nt_embs)
+ ]
+ nt_concat = fluid.layers.concat(nt_encodes)
nt_hid = fluid.layers.fc(nt_concat,
size=self.hidden_size,
param_attr='t_fc.w',
bias_attr='t_fc.b')
-
- # cosine of hidden layers
- cos_pos = fluid.layers.cos_sim(q_hid, pt_hid)
cos_neg = fluid.layers.cos_sim(q_hid, nt_hid)
# pairwise hinge_loss
@@ -254,72 +200,16 @@ class Model(ModelBase):
input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
loss_part2)
- self.avg_cost = fluid.layers.mean(loss_part3)
+ self._cost = fluid.layers.mean(loss_part3)
self.acc = self.get_acc(cos_neg, cos_pos)
-
- def avg_loss(self):
- self._cost = self.avg_cost
-
- def metrics(self):
- self._metrics["loss"] = self.avg_cost
+ self._metrics["loss"] = self._cost
self._metrics["acc"] = self.acc
- def train_net(self):
- self.train_input()
- self.net()
- self.avg_loss()
- self.metrics()
-
- def optimizer(self):
- learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
- None, self._namespace)
- optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
- return optimizer
-
- def infer_input(self):
- res = self.input(is_train=False)
- self._infer_data_var = res
-
- self._infer_data_loader = fluid.io.DataLoader.from_generator(
- feed_list=self._infer_data_var,
- capacity=64,
- use_double_buffer=False,
- iterable=False)
-
- def infer_net(self):
- self.infer_input()
- # lookup embedding for each slot
- q_embs = [
- fluid.embedding(
- input=query, size=self.emb_shape, param_attr="emb")
- for query in self.q_slots
- ]
- pt_embs = [
- fluid.embedding(
- input=title, size=self.emb_shape, param_attr="emb")
- for title in self.pt_slots
- ]
- # encode each embedding field with encoder
- q_encodes = [
- self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
- ]
- pt_encodes = [
- self.title_encoders[i].forward(emb)
- for i, emb in enumerate(pt_embs)
- ]
- # concat multi view for query, pos_title, neg_title
- q_concat = fluid.layers.concat(q_encodes)
- pt_concat = fluid.layers.concat(pt_encodes)
- # projection of hidden layer
- q_hid = fluid.layers.fc(q_concat,
- size=self.hidden_size,
- param_attr='q_fc.w',
- bias_attr='q_fc.b')
- pt_hid = fluid.layers.fc(pt_concat,
- size=self.hidden_size,
- param_attr='t_fc.w',
- bias_attr='t_fc.b')
-
- # cosine of hidden layers
- cos = fluid.layers.cos_sim(q_hid, pt_hid)
- self._infer_results['query_pt_sim'] = cos
+ def get_acc(self, x, y):
+ less = tensor.cast(cf.less_than(x, y), dtype='float32')
+ label_ones = fluid.layers.fill_constant_batch_size_like(
+ input=x, dtype='float32', shape=[-1, 1], value=1.0)
+ correct = fluid.layers.reduce_sum(less)
+ total = fluid.layers.reduce_sum(label_ones)
+ acc = fluid.layers.elementwise_div(correct, total)
+ return acc
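The refactored `net()` now reads `margin` from `hyper_parameters.margin` and keeps the pairwise loss and the `get_acc` metric in one place. A numpy sketch of both, assuming the usual `max(0, margin - cos_pos + cos_neg)` hinge form (the first part of the loss falls outside this hunk, so that form is an assumption):

```python
# Numpy sketch of the pairwise hinge objective and the accuracy metric.
import numpy as np


def hinge_loss_and_acc(cos_pos, cos_neg, margin=0.1):
    """cos_pos, cos_neg: [batch] cosine similarities of pos/neg titles."""
    loss = np.maximum(0.0, margin - cos_pos + cos_neg).mean()
    acc = (cos_neg < cos_pos).mean()   # same comparison as get_acc()
    return loss, acc
```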
diff --git a/models/match/readme.md b/models/match/readme.md
index d9f91b257d81ffde820a04cad49b56edbd903f6a..5599dfbfcb7638e50d916b7014ed742307dc9717 100755
--- a/models/match/readme.md
+++ b/models/match/readme.md
@@ -31,9 +31,21 @@
-## Tutorial
-### Training & Inference
+## Tutorial (Quick Start)
+### Training
```shell
python -m paddlerec.run -m paddlerec.models.match.dssm # dssm
python -m paddlerec.run -m paddlerec.models.match.multiview-simnet # multiview-simnet
```
+
+### Inference
+```shell
+# In the model's config.yaml, set workspace to the absolute path of the current directory
+# In the model's config.yaml, set mode to infer_runner
+# Example: mode: train_runner -> mode: infer_runner
+# In infer_runner, set class to single_infer
+# Switch the phase section to the infer configuration (see the comments in config.yaml)
+
+# After editing config.yaml, run:
+python -m paddlerec.run -m ./config.yaml # dssm as an example
+```
diff --git a/models/recall/gnn/config.yaml b/models/recall/gnn/config.yaml
index 50c6d401153a607a88d5eba713fc439303aad868..f4d5b1efdd213c7ef14af170ab124426b5bc14c6 100755
--- a/models/recall/gnn/config.yaml
+++ b/models/recall/gnn/config.yaml
@@ -11,46 +11,71 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-evaluate:
- workspace: "paddlerec.models.recall.gnn"
- reader:
- batch_size: 50
- class: "{workspace}/evaluate_reader.py"
- test_data_path: "{workspace}/data/test"
-train:
- trainer:
- # for cluster training
- strategy: "async"
+# workspace
+workspace: "paddlerec.models.recall.gnn"
- epochs: 2
- workspace: "paddlerec.models.recall.gnn"
+# list of dataset
+dataset:
+- name: dataset_train # name of dataset to distinguish different datasets
+ batch_size: 100
+ type: DataLoader # or QueueDataset
+ data_path: "{workspace}/data/train"
+ data_converter: "{workspace}/reader.py"
+- name: dataset_infer # name
+ batch_size: 50
+ type: DataLoader # or QueueDataset
+ data_path: "{workspace}/data/test"
+ data_converter: "{workspace}/evaluate_reader.py"
- reader:
- batch_size: 100
- class: "{workspace}/reader.py"
- train_data_path: "{workspace}/data/train"
- dataset_class: "DataLoader"
+# hyper parameters of user-defined network
+hyper_parameters:
+ optimizer:
+ class: Adam
+ learning_rate: 0.001
+ decay_steps: 3
+ decay_rate: 0.1
+ l2: 0.00001
+ sparse_feature_number: 43098
+ sparse_feature_dim: 100
+ corpus_size: 719470
+ gnn_propogation_steps: 1
- model:
- models: "{workspace}/model.py"
- hyper_parameters:
- use_DataLoader: True
- config_path: "{workspace}/data/config.txt"
- sparse_feature_dim: 100
- gnn_propogation_steps: 1
- learning_rate: 0.001
- l2: 0.00001
- decay_steps: 3
- decay_rate: 0.1
- optimizer: adam
+# select runner by name
+mode: train_runner
+# config of each runner.
+# runner is a kind of paddle training class, which wraps the train/infer process.
+runner:
+- name: train_runner
+ class: single_train
+ # num of epochs
+ epochs: 2
+ # device to run training or infer
+ device: cpu
+ save_checkpoint_interval: 1 # save model interval of epochs
+ save_inference_interval: 1 # save inference interval of epochs
+ save_checkpoint_path: "increment" # save checkpoint path
+ save_inference_path: "inference" # save inference path
+ save_inference_feed_varnames: [] # feed vars of save inference
+ save_inference_fetch_varnames: [] # fetch vars of save inference
+ init_model_path: "" # load model path
+ fetch_period: 10
+- name: infer_runner
+ class: single_infer
+ # num of epochs
+ epochs: 1
+ # device to run training or infer
+ device: cpu
+ fetch_period: 1
+ init_model_path: "increment/0" # load model path
- save:
- increment:
- dirname: "increment"
- epoch_interval: 1
- save_last: True
- inference:
- dirname: "inference"
- epoch_interval: 1
- save_last: True
+# runner will run all the phases in each epoch
+phase:
+- name: phase1
+ model: "{workspace}/model.py" # user-defined model
+ dataset_name: dataset_train # select dataset by name
+ thread_num: 1
+#- name: phase2
+# model: "{workspace}/model.py" # user-defined model
+# dataset_name: dataset_infer # select dataset by name
+# thread_num: 1
diff --git a/models/recall/gnn/raw_data/convert_data.py b/models/recall/gnn/data/convert_data.py
similarity index 100%
rename from models/recall/gnn/raw_data/convert_data.py
rename to models/recall/gnn/data/convert_data.py
diff --git a/models/recall/gnn/raw_data/download.py b/models/recall/gnn/data/download.py
similarity index 100%
rename from models/recall/gnn/raw_data/download.py
rename to models/recall/gnn/data/download.py
diff --git a/models/recall/gnn/raw_data/preprocess.py b/models/recall/gnn/data/preprocess.py
similarity index 100%
rename from models/recall/gnn/raw_data/preprocess.py
rename to models/recall/gnn/data/preprocess.py
diff --git a/models/recall/gnn/data_process.sh b/models/recall/gnn/data_prepare.sh
similarity index 82%
rename from models/recall/gnn/data_process.sh
rename to models/recall/gnn/data_prepare.sh
index fc7ed827e0368c59cab8134d22f78e2200980f18..00a3dcebb01f33424ed9e9517967e5cb613bee81 100755
--- a/models/recall/gnn/data_process.sh
+++ b/models/recall/gnn/data_prepare.sh
@@ -17,7 +17,7 @@
set -e
echo "begin to download data"
-cd raw_data && python download.py
+cd data && python download.py
mkdir diginetica
python preprocess.py --dataset diginetica
@@ -26,8 +26,10 @@ python convert_data.py --data_dir diginetica
cat diginetica/train.txt | wc -l >> diginetica/config.txt
-mkdir train_data
-mv diginetica/train.txt train_data
+rm -rf train && mkdir train
+mv diginetica/train.txt train
-mkdir test_data
-mv diginetica/test.txt test_data
+rm -rf test && mkdir test
+mv diginetica/test.txt test
+
+mv diginetica/config.txt ./config.txt
diff --git a/models/recall/gnn/evaluate_reader.py b/models/recall/gnn/evaluate_reader.py
index b26ea8fa9fc347ce402575104dcfa6de23aa80fc..d7b24c963f197417b37ef7aa9f702f45ab669724 100755
--- a/models/recall/gnn/evaluate_reader.py
+++ b/models/recall/gnn/evaluate_reader.py
@@ -21,10 +21,10 @@ from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
-class EvaluateReader(Reader):
+class TrainReader(Reader):
def init(self):
- self.batch_size = envs.get_global_env("batch_size", None,
- "evaluate.reader")
+ self.batch_size = envs.get_global_env(
+ "dataset.dataset_infer.batch_size")
self.input = []
self.length = None
diff --git a/models/recall/gnn/model.py b/models/recall/gnn/model.py
index 027fbb721131e203ed22485b4d8f9bd96b8ed3a3..74ffd7866d92824a0d23aead5bed3d143727381b 100755
--- a/models/recall/gnn/model.py
+++ b/models/recall/gnn/model.py
@@ -25,74 +25,65 @@ from paddlerec.core.model import Model as ModelBase
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
- self.init_config()
- def init_config(self):
- self._fetch_interval = 1
- self.items_num, self.ins_num = self.config_read(
- envs.get_global_env("hyper_parameters.config_path", None,
- self._namespace))
- self.train_batch_size = envs.get_global_env("batch_size", None,
- "train.reader")
- self.evaluate_batch_size = envs.get_global_env("batch_size", None,
- "evaluate.reader")
- self.hidden_size = envs.get_global_env(
- "hyper_parameters.sparse_feature_dim", None, self._namespace)
- self.step = envs.get_global_env(
- "hyper_parameters.gnn_propogation_steps", None, self._namespace)
+ def _init_hyper_parameters(self):
+ self.learning_rate = envs.get_global_env(
+ "hyper_parameters.optimizer.learning_rate")
+ self.decay_steps = envs.get_global_env(
+ "hyper_parameters.optimizer.decay_steps")
+ self.decay_rate = envs.get_global_env(
+ "hyper_parameters.optimizer.decay_rate")
+ self.l2 = envs.get_global_env("hyper_parameters.optimizer.l2")
+
+ self.dict_size = envs.get_global_env(
+ "hyper_parameters.sparse_feature_number")
+ self.corpus_size = envs.get_global_env("hyper_parameters.corpus_size")
- def config_read(self, config_path=None):
- if config_path is None:
- raise ValueError(
- "please set train.model.hyper_parameters.config_path at first")
- with open(config_path, "r") as fin:
- item_nums = int(fin.readline().strip())
- ins_nums = int(fin.readline().strip())
- return item_nums, ins_nums
+ self.train_batch_size = envs.get_global_env(
+ "dataset.dataset_train.batch_size")
+ self.evaluate_batch_size = envs.get_global_env(
+ "dataset.dataset_infer.batch_size")
- def input(self, bs):
- self.items = fluid.data(
+ self.hidden_size = envs.get_global_env(
+ "hyper_parameters.sparse_feature_dim")
+ self.step = envs.get_global_env(
+ "hyper_parameters.gnn_propogation_steps")
+
+ def input_data(self, is_infer=False, **kwargs):
+ if is_infer:
+ bs = self.evaluate_batch_size
+ else:
+ bs = self.train_batch_size
+ items = fluid.data(
name="items", shape=[bs, -1],
dtype="int64") # [batch_size, uniq_max]
- self.seq_index = fluid.data(
+ seq_index = fluid.data(
name="seq_index", shape=[bs, -1, 2],
dtype="int32") # [batch_size, seq_max, 2]
- self.last_index = fluid.data(
+ last_index = fluid.data(
name="last_index", shape=[bs, 2], dtype="int32") # [batch_size, 2]
- self.adj_in = fluid.data(
+ adj_in = fluid.data(
name="adj_in", shape=[bs, -1, -1],
dtype="float32") # [batch_size, seq_max, seq_max]
- self.adj_out = fluid.data(
+ adj_out = fluid.data(
name="adj_out", shape=[bs, -1, -1],
dtype="float32") # [batch_size, seq_max, seq_max]
- self.mask = fluid.data(
+ mask = fluid.data(
name="mask", shape=[bs, -1, 1],
dtype="float32") # [batch_size, seq_max, 1]
- self.label = fluid.data(
+ label = fluid.data(
name="label", shape=[bs, 1], dtype="int64") # [batch_size, 1]
- res = [
- self.items, self.seq_index, self.last_index, self.adj_in,
- self.adj_out, self.mask, self.label
- ]
+ res = [items, seq_index, last_index, adj_in, adj_out, mask, label]
return res
- def train_input(self):
- res = self.input(self.train_batch_size)
- self._data_var = res
-
- use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader",
- False, self._namespace)
+ def net(self, inputs, is_infer=False):
+ if is_infer:
+ bs = self.evaluate_batch_size
+ else:
+ bs = self.train_batch_size
- if self._platform != "LINUX" or use_dataloader:
- self._data_loader = fluid.io.DataLoader.from_generator(
- feed_list=self._data_var,
- capacity=256,
- use_double_buffer=False,
- iterable=False)
-
- def net(self, items_num, hidden_size, step, bs):
- stdv = 1.0 / math.sqrt(hidden_size)
+ stdv = 1.0 / math.sqrt(self.hidden_size)
def embedding_layer(input,
table_name,
@@ -100,22 +91,22 @@ class Model(ModelBase):
initializer_instance=None):
emb = fluid.embedding(
input=input,
- size=[items_num, emb_dim],
+ size=[self.dict_size, emb_dim],
param_attr=fluid.ParamAttr(
- name=table_name, initializer=initializer_instance), )
+ name=table_name, initializer=initializer_instance))
return emb
sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv)
- items_emb = embedding_layer(self.items, "emb", hidden_size,
+ items_emb = embedding_layer(inputs[0], "emb", self.hidden_size,
sparse_initializer)
pre_state = items_emb
- for i in range(step):
+ for i in range(self.step):
pre_state = layers.reshape(
- x=pre_state, shape=[bs, -1, hidden_size])
+ x=pre_state, shape=[bs, -1, self.hidden_size])
state_in = layers.fc(
input=pre_state,
name="state_in",
- size=hidden_size,
+ size=self.hidden_size,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
@@ -127,7 +118,7 @@ class Model(ModelBase):
state_out = layers.fc(
input=pre_state,
name="state_out",
- size=hidden_size,
+ size=self.hidden_size,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
@@ -137,33 +128,34 @@ class Model(ModelBase):
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, uniq_max, h]
- state_adj_in = layers.matmul(self.adj_in,
+ state_adj_in = layers.matmul(inputs[3],
state_in) # [batch_size, uniq_max, h]
state_adj_out = layers.matmul(
- self.adj_out, state_out) # [batch_size, uniq_max, h]
+ inputs[4], state_out) # [batch_size, uniq_max, h]
gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)
gru_input = layers.reshape(
- x=gru_input, shape=[-1, hidden_size * 2])
+ x=gru_input, shape=[-1, self.hidden_size * 2])
gru_fc = layers.fc(input=gru_input,
name="gru_fc",
- size=3 * hidden_size,
+ size=3 * self.hidden_size,
bias_attr=False)
pre_state, _, _ = fluid.layers.gru_unit(
input=gru_fc,
hidden=layers.reshape(
- x=pre_state, shape=[-1, hidden_size]),
- size=3 * hidden_size)
+ x=pre_state, shape=[-1, self.hidden_size]),
+ size=3 * self.hidden_size)
- final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size])
- seq = layers.gather_nd(final_state, self.seq_index)
- last = layers.gather_nd(final_state, self.last_index)
+ final_state = layers.reshape(
+ pre_state, shape=[bs, -1, self.hidden_size])
+ seq = layers.gather_nd(final_state, inputs[1])
+ last = layers.gather_nd(final_state, inputs[2])
seq_fc = layers.fc(
input=seq,
name="seq_fc",
- size=hidden_size,
+ size=self.hidden_size,
bias_attr=False,
act=None,
num_flatten_dims=2,
@@ -171,7 +163,7 @@ class Model(ModelBase):
low=-stdv, high=stdv))) # [batch_size, seq_max, h]
last_fc = layers.fc(input=last,
name="last_fc",
- size=hidden_size,
+ size=self.hidden_size,
bias_attr=False,
act=None,
num_flatten_dims=1,
@@ -184,7 +176,7 @@ class Model(ModelBase):
add = layers.elementwise_add(seq_fc_t,
last_fc) # [seq_max, batch_size, h]
b = layers.create_parameter(
- shape=[hidden_size],
+ shape=[self.hidden_size],
dtype='float32',
default_initializer=fluid.initializer.Constant(value=0.0)) # [h]
add = layers.elementwise_add(add, b) # [seq_max, batch_size, h]
@@ -202,7 +194,7 @@ class Model(ModelBase):
bias_attr=False,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, seq_max, 1]
- weight *= self.mask
+ weight *= inputs[5]
weight_mask = layers.elementwise_mul(
seq, weight, axis=0) # [batch_size, seq_max, h]
global_attention = layers.reduce_sum(
@@ -213,7 +205,7 @@ class Model(ModelBase):
final_attention_fc = layers.fc(
input=final_attention,
name="final_attention_fc",
- size=hidden_size,
+ size=self.hidden_size,
bias_attr=False,
act=None,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
@@ -225,7 +217,7 @@ class Model(ModelBase):
# dtype="int64",
# persistable=True,
# name="all_vocab")
- all_vocab = np.arange(1, items_num).reshape((-1)).astype('int32')
+ all_vocab = np.arange(1, self.dict_size).reshape((-1)).astype('int32')
all_vocab = fluid.layers.cast(
x=fluid.layers.assign(all_vocab), dtype='int64')
@@ -235,63 +227,32 @@ class Model(ModelBase):
name="emb",
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv)),
- size=[items_num, hidden_size]) # [all_vocab, h]
+ size=[self.dict_size, self.hidden_size]) # [all_vocab, h]
logits = layers.matmul(
x=final_attention_fc, y=all_emb,
transpose_y=True) # [batch_size, all_vocab]
softmax = layers.softmax_with_cross_entropy(
- logits=logits, label=self.label) # [batch_size, 1]
+ logits=logits, label=inputs[6]) # [batch_size, 1]
self.loss = layers.reduce_mean(softmax) # [1]
- self.acc = layers.accuracy(input=logits, label=self.label, k=20)
+ self.acc = layers.accuracy(input=logits, label=inputs[6], k=20)
- def avg_loss(self):
self._cost = self.loss
+ if is_infer:
+ self._infer_results['acc'] = self.acc
+ self._infer_results['loss'] = self.loss
+ return
- def metrics(self):
self._metrics["LOSS"] = self.loss
self._metrics["train_acc"] = self.acc
- def train_net(self):
- self.train_input()
- self.net(self.items_num, self.hidden_size, self.step,
- self.train_batch_size)
- self.avg_loss()
- self.metrics()
-
def optimizer(self):
- learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
- None, self._namespace)
- step_per_epoch = self.ins_num // self.train_batch_size
- decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None,
- self._namespace)
- decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None,
- self._namespace)
- l2 = envs.get_global_env("hyper_parameters.l2", None, self._namespace)
+ step_per_epoch = self.corpus_size // self.train_batch_size
optimizer = fluid.optimizer.Adam(
learning_rate=fluid.layers.exponential_decay(
- learning_rate=learning_rate,
- decay_steps=decay_steps * step_per_epoch,
- decay_rate=decay_rate),
+ learning_rate=self.learning_rate,
+ decay_steps=self.decay_steps * step_per_epoch,
+ decay_rate=self.decay_rate),
regularization=fluid.regularizer.L2DecayRegularizer(
- regularization_coeff=l2))
-
+ regularization_coeff=self.l2))
return optimizer
-
- def infer_input(self):
- self._reader_namespace = "evaluate.reader"
- res = self.input(self.evaluate_batch_size)
- self._infer_data_var = res
-
- self._infer_data_loader = fluid.io.DataLoader.from_generator(
- feed_list=self._infer_data_var,
- capacity=64,
- use_double_buffer=False,
- iterable=False)
-
- def infer_net(self):
- self.infer_input()
- self.net(self.items_num, self.hidden_size, self.step,
- self.evaluate_batch_size)
- self._infer_results['acc'] = self.acc
- self._infer_results['loss'] = self.loss
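The rewritten `optimizer()` derives `step_per_epoch` from `corpus_size` rather than from the removed `config.txt`. A back-of-the-envelope check of the resulting schedule using the values in the new config.yaml (illustrative only; `fluid.layers.exponential_decay` without `staircase` decays continuously):

```python
# Decay schedule implied by config.yaml: corpus_size=719470, batch_size=100,
# learning_rate=0.001, decay_steps=3, decay_rate=0.1 (illustrative arithmetic).
step_per_epoch = 719470 // 100            # 7194 mini-batches per epoch
decay_every = 3 * step_per_epoch          # decay_steps * step_per_epoch


def lr_at(global_step, base_lr=0.001, decay_rate=0.1):
    return base_lr * decay_rate ** (global_step / decay_every)


print(lr_at(0), lr_at(decay_every))       # 0.001 -> 0.0001 after ~3 epochs
```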
diff --git a/models/recall/gnn/reader.py b/models/recall/gnn/reader.py
index 68170f09a7a7c84547a67f970b6e127de40b0ccc..fd54277d43e3387ceb51a5bb0759e1f4c1713cfc 100755
--- a/models/recall/gnn/reader.py
+++ b/models/recall/gnn/reader.py
@@ -23,9 +23,8 @@ from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
- self.batch_size = envs.get_global_env("batch_size", None,
- "train.reader")
-
+ self.batch_size = envs.get_global_env(
+ "dataset.dataset_train.batch_size")
self.input = []
self.length = None
diff --git a/models/recall/readme.md b/models/recall/readme.md
index 421df1315dc22396f2ff3bb5aec99508435e2c8d..c51693fdb11613984c1abbf3a48a5306b40e8b61 100755
--- a/models/recall/readme.md
+++ b/models/recall/readme.md
@@ -57,8 +57,8 @@
-## Tutorial
-### Training & Inference
+## Tutorial (Quick Start)
+### Training
```shell
python -m paddlerec.run -m paddlerec.models.recall.word2vec # word2vec
python -m paddlerec.run -m paddlerec.models.recall.ssr # ssr
@@ -67,6 +67,40 @@ python -m paddlerec.run -m paddlerec.models.recall.gnn # gnn
python -m paddlerec.run -m paddlerec.models.recall.ncf # ncf
python -m paddlerec.run -m paddlerec.models.recall.youtube_dnn # youtube_dnn
```
+
+## Tutorial (Reproducing Paper Results)
+To let users run every model quickly, we provide sample data under each model directory and tune hyper-parameters such as batch_size so that the training & test logs are easy to follow on the sample data. To reproduce the results in this readme, adjust batch_size and the other hyper-parameters according to the table below, and use the provided scripts to download and preprocess the corresponding datasets.
+
+| Model | batch_size | thread_num | epoch_num |
+| :---: | :---: | :---: | :---: |
+| Word2Vec | 100 | 5 | 5 |
+| GNN | 100 | 1 | 30 |
+| GRU4REC | 500 | 1 | 10 |
+
+### Data Preparation
+See the data download & preprocessing script in each model directory.
+```bash
+sh data_prepare.sh
+```
+
+### Training
+```bash
+cd models/recall/gnn # enter the directory of the chosen recall model, gnn as an example
+python -m paddlerec.run -m ./config.yaml # after customizing hyper-parameters, point to the config file to use your own settings
+```
+
+### Inference
+```
+# In the model's config.yaml, set workspace to the absolute path of the current directory
+# In the model's config.yaml, set mode to infer_runner
+# Example: mode: train_runner -> mode: infer_runner
+# In infer_runner, set class to single_infer
+# Switch the phase section to the infer configuration (see the comments in config.yaml)
+
+# After editing config.yaml, run:
+python -m paddlerec.run -m ./config.yaml # gnn as an example
+```
+
## Performance Comparison
### Model Performance List
diff --git a/models/recall/word2vec/config.yaml b/models/recall/word2vec/config.yaml
index 9bb5c4d3fe42bc2385ff22ad150924ad5a3a59cd..e2785555c32d3a2300c36a2eba6e8b030fc172b9 100755
--- a/models/recall/word2vec/config.yaml
+++ b/models/recall/word2vec/config.yaml
@@ -11,51 +11,70 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-evaluate:
- workspace: "paddlerec.models.recall.word2vec"
+workspace: "paddlerec.models.recall.word2vec"
- evaluate_only: False
- evaluate_model_path: ""
-
- reader:
- batch_size: 50
- class: "{workspace}/w2v_evaluate_reader.py"
- test_data_path: "{workspace}/data/test"
- word_id_dict_path: "{workspace}/data/dict/word_id_dict.txt"
+# list of dataset
+dataset:
+- name: dataset_train # name of dataset to distinguish different datasets
+ batch_size: 100
+ type: DataLoader # or QueueDataset
+ data_path: "{workspace}/data/train"
+ word_count_dict_path: "{workspace}/data/dict/word_count_dict.txt"
+ data_converter: "{workspace}/w2v_reader.py"
+- name: dataset_infer # name
+ batch_size: 50
+ type: DataLoader # or QueueDataset
+ data_path: "{workspace}/data/test"
+ word_id_dict_path: "{workspace}/data/dict/word_id_dict.txt"
+ data_converter: "{workspace}/w2v_evaluate_reader.py"
-train:
- trainer:
- # for cluster training
- strategy: "async"
+hyper_parameters:
+ optimizer:
+ learning_rate: 1.0
+ decay_steps: 100000
+ decay_rate: 0.999
+ class: sgd
+ strategy: async
+ sparse_feature_number: 354051
+ sparse_feature_dim: 300
+ with_shuffle_batch: False
+ neg_num: 5
+ window_size: 5
+# select runner by name
+mode: train_runner
+# config of each runner.
+# runner is a kind of paddle training class, which wraps the train/infer process.
+runner:
+- name: train_runner
+ class: single_train
+ # num of epochs
epochs: 2
- workspace: "paddlerec.models.recall.word2vec"
+ # device to run training or infer
+ device: cpu
+ save_checkpoint_interval: 1 # save model interval of epochs
+ save_inference_interval: 1 # save inference interval of epochs
+ save_checkpoint_path: "increment" # save checkpoint path
+ save_inference_path: "inference" # save inference path
+ save_inference_feed_varnames: [] # feed vars of save inference
+ save_inference_fetch_varnames: [] # fetch vars of save inference
+ init_model_path: "" # load model path
+ fetch_period: 10
+- name: infer_runner
+ class: single_infer
+ # num of epochs
+ epochs: 1
+ # device to run training or infer
+ device: cpu
+ init_model_path: "increment/0" # load model path
- reader:
- batch_size: 100
- class: "{workspace}/w2v_reader.py"
- train_data_path: "{workspace}/data/train"
- word_count_dict_path: "{workspace}/data/dict/word_count_dict.txt"
-
- model:
- models: "{workspace}/model.py"
- hyper_parameters:
- sparse_feature_number: 85
- sparse_feature_dim: 300
- with_shuffle_batch: False
- neg_num: 5
- window_size: 5
- learning_rate: 1.0
- decay_steps: 100000
- decay_rate: 0.999
- optimizer: sgd
-
- save:
- increment:
- dirname: "increment"
- epoch_interval: 1
- save_last: True
- inference:
- dirname: "inference"
- epoch_interval: 1
- save_last: True
+# runner will run all the phases in each epoch
+phase:
+- name: phase1
+ model: "{workspace}/model.py" # user-defined model
+ dataset_name: dataset_train # select dataset by name
+ thread_num: 1
+#- name: phase2
+# model: "{workspace}/model.py" # user-defined model
+# dataset_name: dataset_infer # select dataset by name
+# thread_num: 1
diff --git a/models/recall/word2vec/prepare_data.sh b/models/recall/word2vec/data_prepare.sh
similarity index 74%
rename from models/recall/word2vec/prepare_data.sh
rename to models/recall/word2vec/data_prepare.sh
index cfd067350ce1d33112806ab72ca78222381a86f4..eb1665240f58ae67afb942885932886e59e5a0a3 100755
--- a/models/recall/word2vec/prepare_data.sh
+++ b/models/recall/word2vec/data_prepare.sh
@@ -22,16 +22,17 @@ tar xvf 1-billion-word-language-modeling-benchmark-r13output.tar
mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/ raw_data/
# preprocess data
-python preprocess.py --build_dict --build_dict_corpus_dir raw_data/training-monolingual.tokenized.shuffled --dict_path raw_data/test_build_dict
-python preprocess.py --filter_corpus --dict_path raw_data/test_build_dict --input_corpus_dir raw_data/training-monolingual.tokenized.shuffled --output_corpus_dir raw_data/convert_text8 --min_count 5 --downsample 0.001
-mkdir thirdparty
-mv raw_data/test_build_dict thirdparty/
-mv raw_data/test_build_dict_word_to_id_ thirdparty/
+python preprocess.py --build_dict --build_dict_corpus_dir raw_data/training-monolingual.tokenized.shuffled --dict_path raw_data/word_count_dict.txt
+python preprocess.py --filter_corpus --dict_path raw_data/word_count_dict.txt --input_corpus_dir raw_data/training-monolingual.tokenized.shuffled --output_corpus_dir raw_data/convert_text8 --min_count 5 --downsample 0.001
+mv raw_data/word_count_dict.txt data/dict/
+mv raw_data/word_id_dict.txt data/dict/
-python preprocess.py --data_resplit --input_corpus_dir=raw_data/convert_text8 --output_corpus_dir=train_data
+rm -rf data/train/*
+rm -rf data/test/*
+python preprocess.py --data_resplit --input_corpus_dir=raw_data/convert_text8 --output_corpus_dir=data/train
# download test data
wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar
tar xzvf test_dir.tar -C raw_data
-mv raw_data/data/test_dir test_data/
+mv raw_data/data/test_dir/* data/test/
rm -rf raw_data
diff --git a/models/recall/word2vec/model.py b/models/recall/word2vec/model.py
index fefc89043c2f926f37318e1094b9cdf98dd6235a..43417e605b021a04e28023567c53c417a77c8ad7 100755
--- a/models/recall/word2vec/model.py
+++ b/models/recall/word2vec/model.py
@@ -23,45 +23,50 @@ class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
- def input(self):
- neg_num = int(
- envs.get_global_env("hyper_parameters.neg_num", None,
- self._namespace))
- self.input_word = fluid.data(
+ def _init_hyper_parameters(self):
+ self.is_distributed = True if envs.get_trainer(
+ ) == "CtrTrainer" else False
+ self.sparse_feature_number = envs.get_global_env(
+ "hyper_parameters.sparse_feature_number")
+ self.sparse_feature_dim = envs.get_global_env(
+ "hyper_parameters.sparse_feature_dim")
+ self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
+ self.with_shuffle_batch = envs.get_global_env(
+ "hyper_parameters.with_shuffle_batch")
+ self.learning_rate = envs.get_global_env(
+ "hyper_parameters.optimizer.learning_rate")
+ self.decay_steps = envs.get_global_env(
+ "hyper_parameters.optimizer.decay_steps")
+ self.decay_rate = envs.get_global_env(
+ "hyper_parameters.optimizer.decay_rate")
+
+ def input_data(self, is_infer=False, **kwargs):
+ if is_infer:
+ analogy_a = fluid.data(
+ name="analogy_a", shape=[None], dtype='int64')
+ analogy_b = fluid.data(
+ name="analogy_b", shape=[None], dtype='int64')
+ analogy_c = fluid.data(
+ name="analogy_c", shape=[None], dtype='int64')
+ analogy_d = fluid.data(
+ name="analogy_d", shape=[None], dtype='int64')
+ return [analogy_a, analogy_b, analogy_c, analogy_d]
+
+ input_word = fluid.data(
name="input_word", shape=[None, 1], dtype='int64')
- self.true_word = fluid.data(
+ true_word = fluid.data(
name='true_label', shape=[None, 1], dtype='int64')
- self._data_var.append(self.input_word)
- self._data_var.append(self.true_word)
- with_shuffle_batch = bool(
- int(
- envs.get_global_env("hyper_parameters.with_shuffle_batch",
- None, self._namespace)))
- if not with_shuffle_batch:
- self.neg_word = fluid.data(
- name="neg_label", shape=[None, neg_num], dtype='int64')
- self._data_var.append(self.neg_word)
+ if self.with_shuffle_batch:
+ return [input_word, true_word]
- if self._platform != "LINUX":
- self._data_loader = fluid.io.DataLoader.from_generator(
- feed_list=self._data_var,
- capacity=64,
- use_double_buffer=False,
- iterable=False)
+ neg_word = fluid.data(
+ name="neg_label", shape=[None, self.neg_num], dtype='int64')
+ return [input_word, true_word, neg_word]
- def net(self):
- is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
- neg_num = int(
- envs.get_global_env("hyper_parameters.neg_num", None,
- self._namespace))
- sparse_feature_number = envs.get_global_env(
- "hyper_parameters.sparse_feature_number", None, self._namespace)
- sparse_feature_dim = envs.get_global_env(
- "hyper_parameters.sparse_feature_dim", None, self._namespace)
- with_shuffle_batch = bool(
- int(
- envs.get_global_env("hyper_parameters.with_shuffle_batch",
- None, self._namespace)))
+ def net(self, inputs, is_infer=False):
+ if is_infer:
+ self.infer_net(inputs)
+ return
def embedding_layer(input,
table_name,
@@ -71,8 +76,8 @@ class Model(ModelBase):
emb = fluid.embedding(
input=input,
is_sparse=True,
- is_distributed=is_distributed,
- size=[sparse_feature_number, emb_dim],
+ is_distributed=self.is_distributed,
+ size=[self.sparse_feature_number, emb_dim],
param_attr=fluid.ParamAttr(
name=table_name, initializer=initializer_instance), )
if squeeze:
@@ -80,44 +85,44 @@ class Model(ModelBase):
else:
return emb
- init_width = 0.5 / sparse_feature_dim
+ init_width = 0.5 / self.sparse_feature_dim
emb_initializer = fluid.initializer.Uniform(-init_width, init_width)
emb_w_initializer = fluid.initializer.Constant(value=0.0)
- input_emb = embedding_layer(self.input_word, "emb", sparse_feature_dim,
+ input_emb = embedding_layer(inputs[0], "emb", self.sparse_feature_dim,
emb_initializer, True)
- true_emb_w = embedding_layer(self.true_word, "emb_w",
- sparse_feature_dim, emb_w_initializer,
- True)
- true_emb_b = embedding_layer(self.true_word, "emb_b", 1,
+ true_emb_w = embedding_layer(inputs[1], "emb_w",
+ self.sparse_feature_dim, emb_w_initializer, True)
+ true_emb_b = embedding_layer(inputs[1], "emb_b", 1, emb_w_initializer,
+ True)
- if with_shuffle_batch:
+ if self.with_shuffle_batch:
neg_emb_w_list = []
- for i in range(neg_num):
+ for i in range(self.neg_num):
neg_emb_w_list.append(
fluid.contrib.layers.shuffle_batch(
true_emb_w))  # shuffle true_word
neg_emb_w_concat = fluid.layers.concat(neg_emb_w_list, axis=0)
neg_emb_w = fluid.layers.reshape(
- neg_emb_w_concat, shape=[-1, neg_num, sparse_feature_dim])
+ neg_emb_w_concat,
+ shape=[-1, self.neg_num, self.sparse_feature_dim])
neg_emb_b_list = []
- for i in range(neg_num):
+ for i in range(self.neg_num):
neg_emb_b_list.append(
fluid.contrib.layers.shuffle_batch(
true_emb_b))  # shuffle true_word
neg_emb_b = fluid.layers.concat(neg_emb_b_list, axis=0)
neg_emb_b_vec = fluid.layers.reshape(
- neg_emb_b, shape=[-1, neg_num])
-
+ neg_emb_b, shape=[-1, self.neg_num])
else:
- neg_emb_w = embedding_layer(self.neg_word, "emb_w",
- sparse_feature_dim, emb_w_initializer)
- neg_emb_b = embedding_layer(self.neg_word, "emb_b", 1,
+ neg_emb_w = embedding_layer(
+ inputs[2], "emb_w", self.sparse_feature_dim, emb_w_initializer)
+ neg_emb_b = embedding_layer(inputs[2], "emb_b", 1,
emb_w_initializer)
neg_emb_b_vec = fluid.layers.reshape(
- neg_emb_b, shape=[-1, neg_num])
+ neg_emb_b, shape=[-1, self.neg_num])
true_logits = fluid.layers.elementwise_add(
fluid.layers.reduce_sum(
@@ -127,18 +132,22 @@ class Model(ModelBase):
true_emb_b)
input_emb_re = fluid.layers.reshape(
- input_emb, shape=[-1, 1, sparse_feature_dim])
+ input_emb, shape=[-1, 1, self.sparse_feature_dim])
neg_matmul = fluid.layers.matmul(
input_emb_re, neg_emb_w, transpose_y=True)
- neg_logits = fluid.layers.elementwise_add(
- fluid.layers.reshape(
- neg_matmul, shape=[-1, neg_num]),
- neg_emb_b_vec)
-
- label_ones = fluid.layers.fill_constant_batch_size_like(
- true_logits, shape=[-1, 1], value=1.0, dtype='float32')
- label_zeros = fluid.layers.fill_constant_batch_size_like(
- true_logits, shape=[-1, neg_num], value=0.0, dtype='float32')
+ neg_matmul_re = fluid.layers.reshape(
+ neg_matmul, shape=[-1, self.neg_num])
+ neg_logits = fluid.layers.elementwise_add(neg_matmul_re, neg_emb_b_vec)
+ # nce loss
+
+ label_ones = fluid.layers.fill_constant(
+ shape=[fluid.layers.shape(true_logits)[0], 1],
+ value=1.0,
+ dtype='float32')
+ label_zeros = fluid.layers.fill_constant(
+ shape=[fluid.layers.shape(true_logits)[0], self.neg_num],
+ value=0.0,
+ dtype='float32')
true_xent = fluid.layers.sigmoid_cross_entropy_with_logits(true_logits,
label_ones)
@@ -149,7 +158,9 @@ class Model(ModelBase):
true_xent, dim=1),
fluid.layers.reduce_sum(
neg_xent, dim=1))
- self.avg_cost = fluid.layers.reduce_mean(cost)
+ avg_cost = fluid.layers.reduce_mean(cost)
+
+ self._cost = avg_cost
global_right_cnt = fluid.layers.create_global_var(
name="global_right_cnt",
persistable=True,
@@ -164,77 +175,33 @@
value=0)
global_right_cnt.stop_gradient = True
global_total_cnt.stop_gradient = True
-
- def avg_loss(self):
- self._cost = self.avg_cost
-
- def metrics(self):
- self._metrics["LOSS"] = self.avg_cost
-
- def train_net(self):
- self.input()
- self.net()
- self.avg_loss()
- self.metrics()
+ self._metrics["LOSS"] = avg_cost
def optimizer(self):
- learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
- None, self._namespace)
- decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None,
- self._namespace)
- decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None,
- self._namespace)
optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
- learning_rate=learning_rate,
- decay_steps=decay_steps,
- decay_rate=decay_rate,
+ learning_rate=self.learning_rate,
+ decay_steps=self.decay_steps,
+ decay_rate=self.decay_rate,
staircase=True))
return optimizer
- def analogy_input(self):
- sparse_feature_number = envs.get_global_env(
- "hyper_parameters.sparse_feature_number", None, self._namespace)
- self.analogy_a = fluid.data(
- name="analogy_a", shape=[None], dtype='int64')
- self.analogy_b = fluid.data(
- name="analogy_b", shape=[None], dtype='int64')
- self.analogy_c = fluid.data(
- name="analogy_c", shape=[None], dtype='int64')
- self.analogy_d = fluid.data(
- name="analogy_d", shape=[None], dtype='int64')
- self._infer_data_var = [
- self.analogy_a, self.analogy_b, self.analogy_c, self.analogy_d
- ]
-
- self._infer_data_loader = fluid.io.DataLoader.from_generator(
- feed_list=self._infer_data_var,
- capacity=64,
- use_double_buffer=False,
- iterable=False)
-
- def infer_net(self):
- sparse_feature_dim = envs.get_global_env(
- "hyper_parameters.sparse_feature_dim", None, self._namespace)
- sparse_feature_number = envs.get_global_env(
- "hyper_parameters.sparse_feature_number", None, self._namespace)
-
+ def infer_net(self, inputs):
def embedding_layer(input, table_name, initializer_instance=None):
emb = fluid.embedding(
input=input,
- size=[sparse_feature_number, sparse_feature_dim],
+ size=[self.sparse_feature_number, self.sparse_feature_dim],
param_attr=table_name)
return emb
- self.analogy_input()
- all_label = np.arange(sparse_feature_number).reshape(
- sparse_feature_number).astype('int32')
+ all_label = np.arange(self.sparse_feature_number).reshape(
+ self.sparse_feature_number).astype('int32')
self.all_label = fluid.layers.cast(
x=fluid.layers.assign(all_label), dtype='int64')
emb_all_label = embedding_layer(self.all_label, "emb")
- emb_a = embedding_layer(self.analogy_a, "emb")
- emb_b = embedding_layer(self.analogy_b, "emb")
- emb_c = embedding_layer(self.analogy_c, "emb")
+ emb_a = embedding_layer(inputs[0], "emb")
+ emb_b = embedding_layer(inputs[1], "emb")
+ emb_c = embedding_layer(inputs[2], "emb")
target = fluid.layers.elementwise_add(
fluid.layers.elementwise_sub(emb_b, emb_a), emb_c)
@@ -245,8 +212,7 @@ class Model(ModelBase):
values, pred_idx = fluid.layers.topk(input=dist, k=4)
label = fluid.layers.expand(
fluid.layers.unsqueeze(
- self.analogy_d, axes=[1]),
- expand_times=[1, 4])
+ inputs[3], axes=[1]), expand_times=[1, 4])
label_ones = fluid.layers.fill_constant_batch_size_like(
label, shape=[-1, 1], value=1.0, dtype='float32')
right_cnt = fluid.layers.reduce_sum(input=fluid.layers.cast(
diff --git a/models/recall/word2vec/preprocess.py b/models/recall/word2vec/preprocess.py
index 6c9ee16cd2d136006dc10e7ce0c970974e8bf2b5..679458e388229c5e3b3a8f3d14d212eb1d263544 100755
--- a/models/recall/word2vec/preprocess.py
+++ b/models/recall/word2vec/preprocess.py
@@ -162,7 +162,7 @@ def filter_corpus(args):
if r_value > keep_prob:
continue
write_line += str(idx)
- write_line += ","
+ write_line += " "
signal = True
if signal:
write_line = write_line[:-1] + "\n"
diff --git a/models/recall/word2vec/w2v_evaluate_reader.py b/models/recall/word2vec/w2v_evaluate_reader.py
index 6350c960e61d8ef3580cc4cc605ba24cb5623b0b..9ced00ac12efd71ca0885a3302ab01cfa4e0ecc6 100755
--- a/models/recall/word2vec/w2v_evaluate_reader.py
+++ b/models/recall/word2vec/w2v_evaluate_reader.py
@@ -20,10 +20,10 @@ from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
-class EvaluateReader(Reader):
+class TrainReader(Reader):
def init(self):
- dict_path = envs.get_global_env("word_id_dict_path", None,
- "evaluate.reader")
+ dict_path = envs.get_global_env(
+ "dataset.dataset_infer.word_id_dict_path")
self.word_to_id = dict()
self.id_to_word = dict()
with io.open(dict_path, 'r', encoding='utf-8') as f:
@@ -75,6 +75,8 @@ class EvaluateReader(Reader):
def generate_sample(self, line):
def reader():
+ if ':' in line:
+ pass
features = self.strip_lines(line.lower(), self.word_to_id)
features = features.split()
yield [('analogy_a', [self.word_to_id[features[0]]]),
diff --git a/models/recall/word2vec/w2v_reader.py b/models/recall/word2vec/w2v_reader.py
index 9b3e69127055118bbc16b30eaac63f9a282bd1eb..15bbd9b0b870adf7bd6daa07469a0a02cd9db079 100755
--- a/models/recall/word2vec/w2v_reader.py
+++ b/models/recall/word2vec/w2v_reader.py
@@ -40,14 +40,12 @@ class NumpyRandomInt(object):
class TrainReader(Reader):
def init(self):
- dict_path = envs.get_global_env("word_count_dict_path", None,
- "train.reader")
- self.window_size = envs.get_global_env("hyper_parameters.window_size",
- None, "train.model")
- self.neg_num = envs.get_global_env("hyper_parameters.neg_num", None,
- "train.model")
+ dict_path = envs.get_global_env(
+ "dataset.dataset_train.word_count_dict_path")
+ self.window_size = envs.get_global_env("hyper_parameters.window_size")
+ self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
self.with_shuffle_batch = envs.get_global_env(
- "hyper_parameters.with_shuffle_batch", None, "train.model")
+ "hyper_parameters.with_shuffle_batch")
self.random_generator = NumpyRandomInt(1, self.window_size + 1)
self.cs = None
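For reference, the word2vec `net()` above builds a negative-sampling objective: sigmoid cross-entropy with label 1 on the true word's logit and label 0 on each sampled negative. A numpy re-statement (illustrative only; the fluid graph is the actual implementation):

```python
# Numpy sketch of the negative-sampling loss assembled in word2vec's net().
import numpy as np


def neg_sampling_loss(true_logits, neg_logits):
    """true_logits: [batch, 1], neg_logits: [batch, neg_num]."""
    sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
    true_xent = -np.log(sigmoid(true_logits))        # label = 1
    neg_xent = -np.log(1.0 - sigmoid(neg_logits))    # label = 0
    return (true_xent.sum(axis=1) + neg_xent.sum(axis=1)).mean()
```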