Commit 7a3ec4e6 authored by tangwei

format

Parent 801dfd34
......@@ -29,4 +29,3 @@ class Engine:
@abc.abstractmethod
def run(self):
pass
......@@ -20,7 +20,6 @@ import os
import sys
import subprocess
from paddlerec.core.engine.engine import Engine
from paddlerec.core.utils import envs
......
......@@ -53,7 +53,7 @@ class Metric(object):
pass
@abc.abstractmethod
def get_result_to_string(self):
def __str__(self):
"""
Return:
result(string) : calculate result with string format, for output
......
......@@ -200,7 +200,7 @@ class AUCMetric(Metric):
""" """
return self._result
def get_result_to_string(self):
def __str__(self):
""" """
result = self.get_result()
result_str = "%s AUC=%.6f BUCKET_ERROR=%.6f MAE=%.6f RMSE=%.6f " \
......
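With get_result_to_string renamed to __str__, metrics plug into Python's built-in string protocol, so print(metric) and "%s" % metric format the result directly. A minimal sketch with a hypothetical SimpleMetric (not from this repo):

# Hypothetical metric illustrating the __str__ protocol; not PaddleRec code.
class SimpleMetric(object):
    def __init__(self):
        self._result = {"AUC": 0.987654}

    def get_result(self):
        return self._result

    def __str__(self):
        result = self.get_result()
        return "AUC=%.6f" % result["AUC"]

metric = SimpleMetric()
print(metric)  # AUC=0.987654 -- no explicit get_result_to_string() call needed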
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Trainer implementations.

          ↗ (single/cluster) CtrTrainer
  Trainer
          ↘ TranspileTrainer → (for single training) SingleTrainer/TDMSingleTrainer
                             ↘ (for cluster training) ClusterTrainer/TDMClusterTrainer
                             ↘ (for online learning training) OnlineLearningTrainer
"""
......@@ -23,7 +23,7 @@ from paddlerec.core.utils import envs
from paddlerec.core.trainer import Trainer
class CtrPaddleTrainer(Trainer):
class CtrTrainer(Trainer):
"""R
"""
......
......@@ -72,7 +72,7 @@ def worker_numric_max(value, env="mpi"):
return wroker_numric_opt(value, env, "max")
class CtrPaddleTrainer(Trainer):
class CtrTrainer(Trainer):
"""R
"""
......
......@@ -31,7 +31,7 @@ from paddlerec.core.utils import envs
from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer
class ClusterTrainer(TranspileTrainer):
class OnlineLearningTrainer(TranspileTrainer):
def processor_register(self):
role = PaddleCloudRoleMaker()
fleet.init(role)
......
......@@ -36,7 +36,8 @@ class SingleTrainer(TranspileTrainer):
self.regist_context_processor('uninit', self.instance)
self.regist_context_processor('init_pass', self.init)
self.regist_context_processor('startup_pass', self.startup)
if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader":
if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None,
"train.reader") != "DataLoader":
self.regist_context_processor('train_pass', self.dataset_train)
else:
self.regist_context_processor('train_pass', self.dataloader_train)
......@@ -122,8 +123,8 @@ class SingleTrainer(TranspileTrainer):
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
end_time = time.time()
times = end_time-begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins/times))
times = end_time - begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins / times))
self.save(i, "train", is_fleet=False)
context['status'] = 'infer_pass'
......
......@@ -27,7 +27,6 @@ from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import f
from paddlerec.core.utils import envs
from paddlerec.core.trainers.cluster_trainer import ClusterTrainer
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
......
......@@ -24,7 +24,6 @@ import paddle.fluid as fluid
from paddlerec.core.trainers.single_trainer import SingleTrainer
from paddlerec.core.utils import envs
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
......
......@@ -22,7 +22,7 @@ from paddlerec.core.utils import fs as fs
from paddlerec.core.utils import util as util
class Dataset(object):
class DatasetHolder(object):
"""
Dataset Base
"""
......@@ -62,7 +62,7 @@ class Dataset(object):
pass
class TimeSplitDataset(Dataset):
class TimeSplitDatasetHolder(DatasetHolder):
"""
Dataset with time split dir. root_path/$DAY/$HOUR
"""
......@@ -142,16 +142,6 @@ class TimeSplitDataset(Dataset):
data_time = data_time + datetime.timedelta(minutes=self._split_interval)
return data_file_list
class FluidTimeSplitDataset(TimeSplitDataset):
"""
A Dataset with time split for PaddleFluid
"""
def __init__(self, config):
""" """
TimeSplitDataset.__init__(self, config)
def _alloc_dataset(self, file_list):
""" """
dataset = fluid.DatasetFactory().create_dataset(self._config['dataset_type'])
......
......@@ -95,11 +95,12 @@ class FileHandler(object):
"""
A smart file handler that automatically chooses local or AFS access based on the path.
"""
def __init__(self, config):
"""R
"""
if 'fs_name' in config:
hadoop_home="$HADOOP_HOME"
hadoop_home = "$HADOOP_HOME"
hdfs_configs = {
"hadoop.job.ugi": config['fs_ugi'],
"fs.default.name": config['fs_name']
......@@ -132,7 +133,8 @@ class FileHandler(object):
if mode.find('a') >= 0:
org_content = self._hdfs_client.cat(dest_path)
content = content + org_content
self._local_fs_client.write(content, temp_local_file, mode) #fleet hdfs_client only support upload, so write tmp file
self._local_fs_client.write(content, temp_local_file,
mode) # fleet hdfs_client only support upload, so write tmp file
self._hdfs_client.delete(dest_path + ".tmp")
self._hdfs_client.upload(dest_path + ".tmp", temp_local_file)
self._hdfs_client.delete(dest_path + ".bak")
......
......@@ -38,6 +38,7 @@ class TrainReader(Reader):
data = [int(i) for i in data]
label = [int(i) for i in label]
seq_len = [int(i) for i in seq_len]
print >>sys.stderr, str([('data', data), ('label', label), ('seq_len', seq_len)])
print >> sys.stderr, str([('data', data), ('label', label), ('seq_len', seq_len)])
yield [('data', data), ('label', label), ('seq_len', seq_len)]
return data_iter
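Note that even after the spacing fix, print >> sys.stderr is Python 2-only syntax. For reference, the Python 3 spelling of the same debug line (toy stand-in values) would be:

# Python 3 equivalent of the Python 2 debug print above (toy values).
import sys

data, label, seq_len = [3, 7], [1], [2]
print(str([('data', data), ('label', label), ('seq_len', seq_len)]), file=sys.stderr)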
......@@ -18,6 +18,7 @@ import paddle.fluid.layers.tensor as tensor
import paddle.fluid.layers.control_flow as cf
from paddlerec.core.model import Model as ModelBase
from paddlerec.core.utils import envs
class Model(ModelBase):
......@@ -25,14 +26,13 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
self.cost = None
self.metrics = {}
self.vocab_text_size = 11447#envs.get_global_env("vocab_text_size", None, self._namespace)
self.vocab_tag_size = 4#envs.get_global_env("vocab_tag_size", None, self._namespace)
self.emb_dim = 10#envs.get_global_env("emb_dim", None, self._namespace)
self.hid_dim = 1000#envs.get_global_env("hid_dim", None, self._namespace)
self.win_size = 5#envs.get_global_env("win_size", None, self._namespace)
self.margin = 0.1#envs.get_global_env("margin", None, self._namespace)
self.neg_size = 3#envs.get_global_env("neg_size", None, self._namespace)
print self.emb_dim
self.vocab_text_size = envs.get_global_env("vocab_text_size", None, self._namespace)
self.vocab_tag_size = envs.get_global_env("vocab_tag_size", None, self._namespace)
self.emb_dim = envs.get_global_env("emb_dim", None, self._namespace)
self.hid_dim = envs.get_global_env("hid_dim", None, self._namespace)
self.win_size = envs.get_global_env("win_size", None, self._namespace)
self.margin = envs.get_global_env("margin", None, self._namespace)
self.neg_size = envs.get_global_env("neg_size", None, self._namespace)
def train_net(self):
""" network definition """
......@@ -96,11 +96,9 @@ class Model(ModelBase):
return self.metrics
def optimizer(self):
learning_rate = 0.01#envs.get_global_env("hyper_parameters.base_lr", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.base_lr", None, self._namespace)
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate)
#sgd_optimizer.minimize(avg_cost)
return sgd_optimizer
def infer_net(self, parameter_list):
self.train_net()
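These hunks swap hard-coded hyperparameters for config lookups through envs.get_global_env(name, default, namespace). A minimal sketch of how such a namespaced lookup can behave -- a hypothetical mini-implementation, not the real paddlerec.core.utils.envs:

# Hypothetical mini-version of a namespaced config lookup; assumes the
# config is a flat dict keyed by "namespace.name".
_GLOBAL_ENVS = {
    "train.model.hyper_parameters.emb_dim": 10,
    "train.model.hyper_parameters.base_lr": 0.01,
}

def get_global_env(name, default=None, namespace=""):
    key = "%s.%s" % (namespace, name) if namespace else name
    return _GLOBAL_ENVS.get(key, default)

emb_dim = get_global_env("emb_dim", None, "train.model.hyper_parameters")
print(emb_dim)  # 10 -- read from config instead of a hard-coded constant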
......@@ -19,6 +19,7 @@ import numpy as np
from paddlerec.core.reader import Reader
class TrainReader(Reader):
def init(self):
pass
......@@ -46,9 +47,6 @@ class TrainReader(Reader):
neg_index = rand_i
neg_tag.append(neg_index)
sum_n += 1
# if n > 0 and len(text) > n:
# #yield None
# return None, None, None
return text, pos_tag, neg_tag
def generate_sample(self, line):
......@@ -58,4 +56,5 @@ class TrainReader(Reader):
yield None
return
yield [('text', text), ('pos_tag', pos_tag), ('neg_tag', neg_tag)]
return data_iter
......@@ -28,7 +28,8 @@ class Model(ModelBase):
self.query = fluid.data(name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_pos = fluid.data(name="doc_pos", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_negs = [fluid.data(name="doc_neg_" + str(i), shape=[-1, TRIGRAM_D], dtype="float32", lod_level=0) for i in range(Neg)]
self.doc_negs = [fluid.data(name="doc_neg_" + str(i), shape=[-1, TRIGRAM_D], dtype="float32", lod_level=0) for i
in range(Neg)]
self._data_var.append(self.query)
self._data_var.append(self.doc_pos)
for input in self.doc_negs:
......@@ -38,7 +39,6 @@ class Model(ModelBase):
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False)
def net(self, is_infer=False):
hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace)
hidden_acts = envs.get_global_env("hyper_parameters.fc_acts", None, self._namespace)
......@@ -46,7 +46,7 @@ class Model(ModelBase):
def fc(data, hidden_layers, hidden_acts, names):
fc_inputs = [data]
for i in range(len(hidden_layers)):
xavier=fluid.initializer.Xavier(uniform=True, fan_in=fc_inputs[-1].shape[1], fan_out=hidden_layers[i])
xavier = fluid.initializer.Xavier(uniform=True, fan_in=fc_inputs[-1].shape[1], fan_out=hidden_layers[i])
out = fluid.layers.fc(input=fc_inputs[-1],
size=hidden_layers[i],
act=hidden_acts[i],
......@@ -65,12 +65,13 @@ class Model(ModelBase):
R_Q_D_ns = []
for i, doc_neg in enumerate(self.doc_negs):
doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts, ['doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i), 'doc_neg_l3_' + str(i)])
doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts,
['doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i), 'doc_neg_l3_' + str(i)])
R_Q_D_ns.append(fluid.layers.cos_sim(query_fc, doc_neg_fc_i))
concat_Rs = fluid.layers.concat(input=[self.R_Q_D_p] + R_Q_D_ns, axis=-1)
prob = fluid.layers.softmax(concat_Rs, axis=1)
hit_prob = fluid.layers.slice(prob, axes=[0,1], starts=[0,0], ends=[4, 1])
hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[4, 1])
loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob))
self.avg_cost = fluid.layers.mean(x=loss)
......
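For context, the loss in this hunk is listwise: softmax over the cosine similarities [R(Q,D+), R(Q,D1-), ..., R(Q,DNeg-)], then -log of the positive's probability. A numpy sketch with toy scores (Neg=4 negatives assumed):

# Numpy sketch of the DSSM listwise loss; toy similarity scores.
import numpy as np

def dssm_loss(r_pos, r_negs):
    logits = np.concatenate([[r_pos], r_negs])     # [1 + Neg] similarities
    probs = np.exp(logits) / np.exp(logits).sum()  # softmax over pos + negs
    return -np.log(probs[0])                       # hit_prob = positive's slot

print(dssm_loss(0.9, [0.1, 0.2, 0.05, 0.3]))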
......@@ -37,7 +37,7 @@ class TrainReader(Reader):
neg_docs = []
for i in range(len(features) - 2):
feature_names.append('doc_neg_' + str(i))
neg_docs.append(map(float, features[i+2].split(',')))
neg_docs.append(map(float, features[i + 2].split(',')))
yield zip(feature_names, [query] + [pos_doc] + neg_docs)
......
......@@ -51,4 +51,5 @@ class EvaluateReader(Reader):
else:
output[index][1].append(padding)
yield output
return data_iter
......@@ -14,10 +14,12 @@
import random
class Dataset:
def __init__(self):
pass
class SyntheticDataset(Dataset):
def __init__(self, sparse_feature_dim, query_slot_num, title_slot_num, dataset_size=10000):
# ids are randomly generated
......@@ -50,7 +52,8 @@ class SyntheticDataset(Dataset):
for i in range(self.title_slot_num):
nt_slot = generate_ids(self.ids_per_slot,
self.sparse_feature_dim)
nt_slot = [str(fea) + ':' + str(i + self.query_slot_num + self.title_slot_num) for fea in nt_slot]
nt_slot = [str(fea) + ':' + str(i + self.query_slot_num + self.title_slot_num) for fea in
nt_slot]
neg_title_slots += nt_slot
yield query_slots + pos_title_slots + neg_title_slots
else:
......@@ -67,6 +70,7 @@ class SyntheticDataset(Dataset):
def test(self):
return self._reader_creator(False)
if __name__ == '__main__':
sparse_feature_dim = 1000001
query_slots = 1
......
......@@ -19,6 +19,7 @@ import paddle.fluid.layers.control_flow as cf
from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase
class BowEncoder(object):
""" bow-encoder """
......@@ -94,6 +95,7 @@ class SimpleEncoderFactory(object):
rnn_encode = GrnnEncoder(hidden_size=enc_hid_size)
return rnn_encode
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
......@@ -140,7 +142,8 @@ class Model(ModelBase):
self.nt_slots = [
fluid.data(
name="%d" % (i + len(self.query_encoders) + len(self.title_encoders)), shape=[None, 1], lod_level=1, dtype='int64')
name="%d" % (i + len(self.query_encoders) + len(self.title_encoders)), shape=[None, 1], lod_level=1,
dtype='int64')
for i in range(len(self.title_encoders))
]
......
......@@ -54,4 +54,5 @@ class TrainReader(Reader):
else:
output[index][1].append(padding)
yield output
return data_iter
......@@ -18,14 +18,14 @@ from collections import defaultdict
from paddlerec.core.reader import Reader
class EvaluateReader(Reader):
def init(self):
all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128', '129',
all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128',
'129',
'205', '206', '207', '210', '216', '508', '509', '702', '853', '301']
self.all_field_id_dict = defaultdict(int)
for i,field_id in enumerate(all_field_id):
self.all_field_id_dict[field_id] = [False,i]
for i, field_id in enumerate(all_field_id):
self.all_field_id_dict[field_id] = [False, i]
def generate_sample(self, line):
"""
......@@ -41,10 +41,10 @@ class EvaluateReader(Reader):
cvr = int(features[2])
padding = 0
output = [(field_id,[]) for field_id in self.all_field_id_dict]
output = [(field_id, []) for field_id in self.all_field_id_dict]
for elem in features[4:]:
field_id,feat_id = elem.strip().split(':')
field_id, feat_id = elem.strip().split(':')
if field_id not in self.all_field_id_dict:
continue
self.all_field_id_dict[field_id][0] = True
......@@ -52,7 +52,7 @@ class EvaluateReader(Reader):
output[index][1].append(int(feat_id))
for field_id in self.all_field_id_dict:
visited,index = self.all_field_id_dict[field_id]
visited, index = self.all_field_id_dict[field_id]
if visited:
self.all_field_id_dict[field_id][0] = False
else:
......@@ -60,4 +60,5 @@ class EvaluateReader(Reader):
output.append(('ctr', [ctr]))
output.append(('cvr', [cvr]))
yield output
return reader
......@@ -21,11 +21,12 @@ from paddlerec.core.reader import Reader
class TrainReader(Reader):
def init(self):
all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128', '129',
all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128',
'129',
'205', '206', '207', '210', '216', '508', '509', '702', '853', '301']
self.all_field_id_dict = defaultdict(int)
for i,field_id in enumerate(all_field_id):
self.all_field_id_dict[field_id] = [False,i]
for i, field_id in enumerate(all_field_id):
self.all_field_id_dict[field_id] = [False, i]
def generate_sample(self, line):
"""
......@@ -37,25 +38,25 @@ class TrainReader(Reader):
This function needs to be implemented by the user, based on data format
"""
features = line.strip().split(',')
#ctr = list(map(int, features[1]))
#cvr = list(map(int, features[2]))
# ctr = list(map(int, features[1]))
# cvr = list(map(int, features[2]))
ctr = int(features[1])
cvr = int(features[2])
padding = 0
output = [(field_id,[]) for field_id in self.all_field_id_dict]
output = [(field_id, []) for field_id in self.all_field_id_dict]
for elem in features[4:]:
field_id,feat_id = elem.strip().split(':')
field_id, feat_id = elem.strip().split(':')
if field_id not in self.all_field_id_dict:
continue
self.all_field_id_dict[field_id][0] = True
index = self.all_field_id_dict[field_id][1]
#feat_id = list(map(int, feat_id))
# feat_id = list(map(int, feat_id))
output[index][1].append(int(feat_id))
for field_id in self.all_field_id_dict:
visited,index = self.all_field_id_dict[field_id]
visited, index = self.all_field_id_dict[field_id]
if visited:
self.all_field_id_dict[field_id][0] = False
else:
......@@ -63,4 +64,5 @@ class TrainReader(Reader):
output.append(('ctr', [ctr]))
output.append(('cvr', [cvr]))
yield output
return reader
......@@ -23,13 +23,14 @@ class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
def fc(self,tag, data, out_dim, active='prelu'):
def fc(self, tag, data, out_dim, active='prelu'):
init_stddev = 1.0
scales = 1.0 / np.sqrt(data.shape[1])
p_attr = fluid.param_attr.ParamAttr(name='%s_weight' % tag,
initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=init_stddev * scales))
initializer=fluid.initializer.NormalInitializer(loc=0.0,
scale=init_stddev * scales))
b_attr = fluid.ParamAttr(name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1))
......@@ -37,13 +38,13 @@ class Model(ModelBase):
size=out_dim,
act=active,
param_attr=p_attr,
bias_attr =b_attr,
bias_attr=b_attr,
name=tag)
return out
def input_data(self):
sparse_input_ids = [
fluid.data(name="field_" + str(i), shape=[-1, 1], dtype="int64", lod_level=1) for i in range(0,23)
fluid.data(name="field_" + str(i), shape=[-1, 1], dtype="int64", lod_level=1) for i in range(0, 23)
]
label_ctr = fluid.data(name="ctr", shape=[-1, 1], dtype="int64")
label_cvr = fluid.data(name="cvr", shape=[-1, 1], dtype="int64")
......@@ -62,10 +63,11 @@ class Model(ModelBase):
size=[vocab_size, embed_size],
param_attr=fluid.ParamAttr(name='dis_emb',
learning_rate=5,
initializer=fluid.initializer.Xavier(fan_in=embed_size,fan_out=embed_size)
initializer=fluid.initializer.Xavier(
fan_in=embed_size, fan_out=embed_size)
),
is_sparse=True)
field_emb = fluid.layers.sequence_pool(input=feat_emb,pool_type='sum')
field_emb = fluid.layers.sequence_pool(input=feat_emb, pool_type='sum')
emb.append(field_emb)
concat_emb = fluid.layers.concat(emb, axis=1)
......@@ -78,7 +80,7 @@ class Model(ModelBase):
# cvr
cvr_fc1 = self.fc('cvr_fc1', concat_emb, 200, active)
cvr_fc2 = self.fc('cvr_fc2', cvr_fc1, 80, active)
cvr_out = self.fc('cvr_out', cvr_fc2, 2,'softmax')
cvr_out = self.fc('cvr_out', cvr_fc2, 2, 'softmax')
ctr_clk = inputs[-2]
ctcvr_buy = inputs[-1]
......@@ -87,7 +89,7 @@ class Model(ModelBase):
cvr_prop_one = fluid.layers.slice(cvr_out, axes=[1], starts=[1], ends=[2])
ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one, cvr_prop_one)
ctcvr_prop = fluid.layers.concat(input=[1-ctcvr_prop_one,ctcvr_prop_one], axis = 1)
ctcvr_prop = fluid.layers.concat(input=[1 - ctcvr_prop_one, ctcvr_prop_one], axis=1)
auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(input=ctr_out, label=ctr_clk)
auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(input=ctcvr_prop, label=ctcvr_buy)
......@@ -97,25 +99,21 @@ class Model(ModelBase):
self._infer_results["AUC_ctcvr"] = auc_ctcvr
return
loss_ctr = fluid.layers.cross_entropy(input=ctr_out, label=ctr_clk)
loss_ctcvr = fluid.layers.cross_entropy(input=ctcvr_prop, label=ctcvr_buy)
cost = loss_ctr + loss_ctcvr
avg_cost = fluid.layers.mean(cost)
self._cost = avg_cost
self._metrics["AUC_ctr"] = auc_ctr
self._metrics["BATCH_AUC_ctr"] = batch_auc_ctr
self._metrics["AUC_ctcvr"] = auc_ctcvr
self._metrics["BATCH_AUC_ctcvr"] = batch_auc_ctcvr
def train_net(self):
input_data = self.input_data()
self.net(input_data)
def infer_net(self):
self._infer_data_var = self.input_data()
self._infer_data_loader = fluid.io.DataLoader.from_generator(
......
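The elementwise_mul above is the ESMM identity pCTCVR(x) = pCTR(x) * pCVR(x), estimated over the full impression space; the concat rebuilds a two-column [1 - p, p] distribution so cross_entropy and auc can consume it. A numpy illustration with toy probabilities:

# Toy check of the ESMM probability chain used above.
import numpy as np

ctr_prop_one = np.array([[0.3], [0.8]])        # P(click)
cvr_prop_one = np.array([[0.1], [0.5]])        # P(buy | click)
ctcvr_prop_one = ctr_prop_one * cvr_prop_one   # P(click and buy)
ctcvr_prop = np.concatenate([1 - ctcvr_prop_one, ctcvr_prop_one], axis=1)
print(ctcvr_prop)  # each row sums to 1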
......@@ -43,8 +43,8 @@ class TrainReader(Reader):
label_marital = [1, 0]
elif int(l[0]) == 1:
label_marital = [0, 1]
#label_income = np.array(label_income)
#label_marital = np.array(label_marital)
# label_income = np.array(label_income)
# label_marital = np.array(label_marital)
feature_name = ["input", "label_income", "label_marital"]
yield zip(feature_name, [data] + [label_income] + [label_marital])
......
......@@ -49,8 +49,7 @@ class Model(ModelBase):
name='expert_' + str(i))
expert_outputs.append(expert_output)
expert_concat = fluid.layers.concat(expert_outputs, axis=1)
expert_concat = fluid.layers.reshape(expert_concat,[-1, expert_num, expert_size])
expert_concat = fluid.layers.reshape(expert_concat, [-1, expert_num, expert_size])
# g^{k}(x) = activation(W_{gk} * x + b), where activation is softmax according to the paper
output_layers = []
......@@ -78,19 +77,22 @@ class Model(ModelBase):
pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15)
pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15)
label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2])
label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2])
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income, label=fluid.layers.cast(x=label_income_1, dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital, label=fluid.layers.cast(x=label_marital_1, dtype='int64'))
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income,
label=fluid.layers.cast(x=label_income_1,
dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital,
label=fluid.layers.cast(x=label_marital_1,
dtype='int64'))
if is_infer:
self._infer_results["AUC_income"] = auc_income
self._infer_results["AUC_marital"] = auc_marital
return
cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income,soft_label = True)
cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital,soft_label = True)
cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income, soft_label=True)
cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital, soft_label=True)
avg_cost_income = fluid.layers.mean(x=cost_income)
avg_cost_marital = fluid.layers.mean(x=cost_marital)
......@@ -103,10 +105,8 @@ class Model(ModelBase):
self._metrics["AUC_marital"] = auc_marital
self._metrics["BATCH_AUC_marital"] = batch_auc_2
def train_net(self):
self.MMOE()
def infer_net(self):
self.MMOE(is_infer=True)
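The reshape of expert_concat to [-1, expert_num, expert_size] sets up the per-task gated mixture named in the in-code comment g^k(x). A numpy sketch of that mixing step, with assumed toy shapes:

# Numpy sketch of MMoE gated mixing (batch b, expert_num e, expert_size d).
import numpy as np

b, e, d = 2, 3, 4
experts = np.random.rand(b, e, d)                 # reshaped expert_concat
gate_logits = np.random.rand(b, e)
gate = np.exp(gate_logits) / np.exp(gate_logits).sum(axis=1, keepdims=True)
task_input = (experts * gate[:, :, None]).sum(axis=1)  # weighted expert sum
print(task_input.shape)  # (2, 4) -- one mixed representation per task/tower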
......@@ -43,8 +43,8 @@ class TrainReader(Reader):
label_marital = [1, 0]
elif int(l[0]) == 1:
label_marital = [0, 1]
#label_income = np.array(label_income)
#label_marital = np.array(label_marital)
# label_income = np.array(label_income)
# label_marital = np.array(label_marital)
feature_name = ["input", "label_income", "label_marital"]
yield zip(feature_name, [data] + [label_income] + [label_marital])
......
......@@ -46,7 +46,6 @@ class Model(ModelBase):
bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='bottom_output')
# Build tower layer from bottom layer
output_layers = []
for index in range(tower_nums):
......@@ -60,23 +59,26 @@ class Model(ModelBase):
name='output_layer_' + str(index))
output_layers.append(output_layer)
pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15)
pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15)
label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2])
label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2])
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income, label=fluid.layers.cast(x=label_income_1, dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital, label=fluid.layers.cast(x=label_marital_1, dtype='int64'))
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income,
label=fluid.layers.cast(x=label_income_1,
dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital,
label=fluid.layers.cast(x=label_marital_1,
dtype='int64'))
if is_infer:
self._infer_results["AUC_income"] = auc_income
self._infer_results["AUC_marital"] = auc_marital
return
cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income,soft_label = True)
cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital,soft_label = True)
cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income, soft_label=True)
cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital, soft_label=True)
cost = fluid.layers.elementwise_add(cost_income, cost_marital, axis=1)
avg_cost = fluid.layers.mean(x=cost)
......@@ -87,10 +89,8 @@ class Model(ModelBase):
self._metrics["AUC_marital"] = auc_marital
self._metrics["BATCH_AUC_marital"] = batch_auc_2
def train_net(self):
self.model()
def infer_net(self):
self.model(is_infer=True)
......@@ -21,7 +21,6 @@ try:
except ImportError:
import pickle
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
......@@ -85,6 +84,7 @@ class TrainReader(Reader):
"""
Read the data line by line and process it as a dictionary
"""
def data_iter():
label_feat_list = self._process_line(line)
yield list(zip(self.label_feat_names, label_feat_list))
......
......@@ -142,7 +142,6 @@ class Model(ModelBase):
self._metrics["AUC"] = auc_var
self._metrics["BATCH_AUC"] = batch_auc_var
# logloss
logloss = fluid.layers.log_loss(self.prob, self.target_input)
self.avg_logloss = fluid.layers.reduce_mean(logloss)
......
......@@ -67,6 +67,7 @@ class TrainReader(Reader):
"""
Read the data line by line and process it as a dictionary
"""
def data_iter():
feat_idx, feat_value, label = self._process_line(line)
yield [('feat_idx', feat_idx), ('feat_value', feat_value), ('label', label)]
......
......@@ -33,10 +33,11 @@ class Model(ModelBase):
# ------------------------- network input --------------------------
num_field = envs.get_global_env("hyper_parameters.num_field", None, self._namespace)
raw_feat_idx = fluid.data(name='feat_idx', shape=[None, num_field], dtype='int64') # None * num_field(default:39)
raw_feat_idx = fluid.data(name='feat_idx', shape=[None, num_field],
dtype='int64') # None * num_field(default:39)
raw_feat_value = fluid.data(name='feat_value', shape=[None, num_field], dtype='float32') # None * num_field
self.label = fluid.data(name='label', shape=[None, 1], dtype='float32') # None * 1
feat_idx = fluid.layers.reshape(raw_feat_idx,[-1, 1]) # (None * num_field) * 1
feat_idx = fluid.layers.reshape(raw_feat_idx, [-1, 1]) # (None * num_field) * 1
feat_value = fluid.layers.reshape(raw_feat_value, [-1, num_field, 1]) # None * num_field * 1
# ------------------------- set _data_var --------------------------
......@@ -48,7 +49,7 @@ class Model(ModelBase):
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False)
#------------------------- first order term --------------------------
# ------------------------- first order term --------------------------
reg = envs.get_global_env("hyper_parameters.reg", 1e-4, self._namespace)
first_weights_re = fluid.embedding(
......@@ -66,7 +67,7 @@ class Model(ModelBase):
first_weights_re, shape=[-1, num_field, 1]) # None * num_field * 1
y_first_order = fluid.layers.reduce_sum((first_weights * feat_value), 1)
#------------------------- second order term --------------------------
# ------------------------- second order term --------------------------
feat_embeddings_re = fluid.embedding(
input=feat_idx,
......@@ -100,8 +101,7 @@ class Model(ModelBase):
summed_features_emb_square - squared_sum_features_emb, 1,
keep_dim=True) # None * 1
#------------------------- DNN --------------------------
# ------------------------- DNN --------------------------
layer_sizes = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace)
act = envs.get_global_env("hyper_parameters.act", None, self._namespace)
......@@ -129,21 +129,21 @@ class Model(ModelBase):
initializer=fluid.initializer.TruncatedNormalInitializer(
loc=0.0, scale=init_value_)))
#------------------------- DeepFM --------------------------
# ------------------------- DeepFM --------------------------
self.predict = fluid.layers.sigmoid(y_first_order + y_second_order + y_dnn)
def train_net(self):
self.deepfm_net()
#------------------------- Cost(logloss) --------------------------
# ------------------------- Cost(logloss) --------------------------
cost = fluid.layers.log_loss(input=self.predict, label=self.label)
avg_cost = fluid.layers.reduce_sum(cost)
self._cost = avg_cost
#------------------------- Metric(Auc) --------------------------
# ------------------------- Metric(Auc) --------------------------
predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1)
label_int = fluid.layers.cast(self.label, 'int64')
......
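The "second order term" block relies on the standard FM identity: the sum of all pairwise embedding interactions equals half of (square-of-sum minus sum-of-squares), computed per embedding dimension. A numpy check with toy shapes (the 0.5 factor belongs to the standard form):

# Verify: sum_{i<j} <v_i, v_j> == 0.5 * ((sum_i v_i)^2 - sum_i v_i^2).
import numpy as np

v = np.random.rand(39, 8)                  # num_field x embedding dim
summed_sq = v.sum(axis=0) ** 2             # (sum of embeddings)^2, per dim
sq_summed = (v ** 2).sum(axis=0)           # sum of squared embeddings
y_second_order = 0.5 * (summed_sq - sq_summed).sum()

pairwise = sum(v[i] @ v[j] for i in range(39) for j in range(i + 1, 39))
print(np.allclose(y_second_order, pairwise))  # True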
......@@ -47,9 +47,6 @@ class TrainReader(Reader):
self.batch_size = envs.get_global_env("batch_size", 32, "train.reader")
self.group_size = self.batch_size * 20
def _process_line(self, line):
line = line.strip().split(';')
hist = line[0].split()
......@@ -58,13 +55,13 @@ class TrainReader(Reader):
cate = [int(i) for i in cate]
return [hist, cate, [int(line[2])], [int(line[3])], [float(line[4])]]
def generate_sample(self, line):
"""
Read the data line by line and process it as a dictionary
"""
def data_iter():
#feat_idx, feat_value, label = self._process_line(line)
# feat_idx, feat_value, label = self._process_line(line)
yield self._process_line(line)
return data_iter
......@@ -131,5 +128,3 @@ class TrainReader(Reader):
data_set = self.base_read(files)
random.shuffle(data_set)
return self.batch_reader(data_set, self.batch_size, self.batch_size * 20)
\ No newline at end of file
......@@ -27,8 +27,12 @@ class Model(ModelBase):
def wide_part(self, data):
out = fluid.layers.fc(input=data,
size=1,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0 / math.sqrt(data.shape[1])),
regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)),
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0,
scale=1.0 / math.sqrt(
data.shape[
1])),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)),
act=None,
name='wide')
return out
......@@ -36,7 +40,10 @@ class Model(ModelBase):
def fc(self, data, hidden_units, active, tag):
output = fluid.layers.fc(input=data,
size=hidden_units,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0 / math.sqrt(data.shape[1]))),
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0,
scale=1.0 / math.sqrt(
data.shape[
1]))),
act=active,
name=tag)
......@@ -65,13 +72,15 @@ class Model(ModelBase):
wide_model = fluid.layers.fc(input=wide_output,
size=1,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)),
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)),
act=None,
name='w_wide')
deep_model = fluid.layers.fc(input=deep_output,
size=1,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)),
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)),
act=None,
name='w_deep')
......
......@@ -30,7 +30,7 @@ class TrainReader(Reader):
line = line.strip().split(',')
features = list(map(float, line))
wide_feat = features[0:8]
deep_feat = features[8:58+8]
deep_feat = features[8:58 + 8]
label = features[-1]
return wide_feat, deep_feat, [label]
......@@ -38,6 +38,7 @@ class TrainReader(Reader):
"""
Read the data line by line and process it as a dictionary
"""
def data_iter():
wide_feat, deep_feat, label = self._process_line(line)
yield [('wide_input', wide_feat), ('deep_input', deep_feat), ('label', label)]
......
......@@ -122,8 +122,9 @@ class EvaluateReader(Reader):
else:
# Due to fixed batch_size, discard the remaining ins
return
#cur_batch = remain_data[i:]
#yield self.make_data(cur_batch, group_remain % batch_size)
# cur_batch = remain_data[i:]
# yield self.make_data(cur_batch, group_remain % batch_size)
return _reader
def generate_batch_from_trainfiles(self, files):
......@@ -134,4 +135,5 @@ class EvaluateReader(Reader):
def generate_sample(self, line):
def data_iter():
yield []
return data_iter
......@@ -29,13 +29,13 @@ class Model(ModelBase):
def init_config(self):
self._fetch_interval = 1
self.items_num, self.ins_num = self.config_read(envs.get_global_env("hyper_parameters.config_path", None, self._namespace))
self.items_num, self.ins_num = self.config_read(
envs.get_global_env("hyper_parameters.config_path", None, self._namespace))
self.train_batch_size = envs.get_global_env("batch_size", None, "train.reader")
self.evaluate_batch_size = envs.get_global_env("batch_size", None, "evaluate.reader")
self.hidden_size = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
self.step = envs.get_global_env("hyper_parameters.gnn_propogation_steps", None, self._namespace)
def config_read(self, config_path=None):
if config_path is None:
raise ValueError("please set train.model.hyper_parameters.config_path at first")
......@@ -48,31 +48,31 @@ class Model(ModelBase):
self.items = fluid.data(
name="items",
shape=[bs, -1],
dtype="int64") #[batch_size, uniq_max]
dtype="int64") # [batch_size, uniq_max]
self.seq_index = fluid.data(
name="seq_index",
shape=[bs, -1, 2],
dtype="int32") #[batch_size, seq_max, 2]
dtype="int32") # [batch_size, seq_max, 2]
self.last_index = fluid.data(
name="last_index",
shape=[bs, 2],
dtype="int32") #[batch_size, 2]
dtype="int32") # [batch_size, 2]
self.adj_in = fluid.data(
name="adj_in",
shape=[bs, -1, -1],
dtype="float32") #[batch_size, seq_max, seq_max]
dtype="float32") # [batch_size, seq_max, seq_max]
self.adj_out = fluid.data(
name="adj_out",
shape=[bs, -1, -1],
dtype="float32") #[batch_size, seq_max, seq_max]
dtype="float32") # [batch_size, seq_max, seq_max]
self.mask = fluid.data(
name="mask",
shape=[bs, -1, 1],
dtype="float32") #[batch_size, seq_max, 1]
dtype="float32") # [batch_size, seq_max, 1]
self.label = fluid.data(
name="label",
shape=[bs, 1],
dtype="int64") #[batch_size, 1]
dtype="int64") # [batch_size, 1]
res = [self.items, self.seq_index, self.last_index, self.adj_in, self.adj_out, self.mask, self.label]
return res
......@@ -114,7 +114,7 @@ class Model(ModelBase):
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv)),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) #[batch_size, uniq_max, h]
low=-stdv, high=stdv))) # [batch_size, uniq_max, h]
state_out = layers.fc(
input=pre_state,
name="state_out",
......@@ -124,10 +124,10 @@ class Model(ModelBase):
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv)),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) #[batch_size, uniq_max, h]
low=-stdv, high=stdv))) # [batch_size, uniq_max, h]
state_adj_in = layers.matmul(self.adj_in, state_in) #[batch_size, uniq_max, h]
state_adj_out = layers.matmul(self.adj_out, state_out) #[batch_size, uniq_max, h]
state_adj_in = layers.matmul(self.adj_in, state_in) # [batch_size, uniq_max, h]
state_adj_out = layers.matmul(self.adj_out, state_out) # [batch_size, uniq_max, h]
gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)
......@@ -155,7 +155,7 @@ class Model(ModelBase):
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) #[batch_size, seq_max, h]
low=-stdv, high=stdv))) # [batch_size, seq_max, h]
last_fc = layers.fc(
input=last,
name="last_fc",
......@@ -165,21 +165,21 @@ class Model(ModelBase):
num_flatten_dims=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) #[batch_size, h]
low=-stdv, high=stdv))) # [batch_size, h]
seq_fc_t = layers.transpose(
seq_fc, perm=[1, 0, 2]) #[seq_max, batch_size, h]
seq_fc, perm=[1, 0, 2]) # [seq_max, batch_size, h]
add = layers.elementwise_add(
seq_fc_t, last_fc) #[seq_max, batch_size, h]
seq_fc_t, last_fc) # [seq_max, batch_size, h]
b = layers.create_parameter(
shape=[hidden_size],
dtype='float32',
default_initializer=fluid.initializer.Constant(value=0.0)) #[h]
add = layers.elementwise_add(add, b) #[seq_max, batch_size, h]
default_initializer=fluid.initializer.Constant(value=0.0)) # [h]
add = layers.elementwise_add(add, b) # [seq_max, batch_size, h]
add_sigmoid = layers.sigmoid(add) #[seq_max, batch_size, h]
add_sigmoid = layers.sigmoid(add) # [seq_max, batch_size, h]
add_sigmoid = layers.transpose(
add_sigmoid, perm=[1, 0, 2]) #[batch_size, seq_max, h]
add_sigmoid, perm=[1, 0, 2]) # [batch_size, seq_max, h]
weight = layers.fc(
input=add_sigmoid,
......@@ -190,13 +190,13 @@ class Model(ModelBase):
bias_attr=False,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) #[batch_size, seq_max, 1]
low=-stdv, high=stdv))) # [batch_size, seq_max, 1]
weight *= self.mask
weight_mask = layers.elementwise_mul(seq, weight, axis=0) #[batch_size, seq_max, h]
global_attention = layers.reduce_sum(weight_mask, dim=1) #[batch_size, h]
weight_mask = layers.elementwise_mul(seq, weight, axis=0) # [batch_size, seq_max, h]
global_attention = layers.reduce_sum(weight_mask, dim=1) # [batch_size, h]
final_attention = layers.concat(
[global_attention, last], axis=1) #[batch_size, 2*h]
[global_attention, last], axis=1) # [batch_size, 2*h]
final_attention_fc = layers.fc(
input=final_attention,
name="final_attention_fc",
......@@ -204,7 +204,7 @@ class Model(ModelBase):
bias_attr=False,
act=None,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) #[batch_size, h]
low=-stdv, high=stdv))) # [batch_size, h]
# all_vocab = layers.create_global_var(
# shape=[items_num - 1],
......@@ -221,13 +221,13 @@ class Model(ModelBase):
name="emb",
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv)),
size=[items_num, hidden_size]) #[all_vocab, h]
size=[items_num, hidden_size]) # [all_vocab, h]
logits = layers.matmul(
x=final_attention_fc, y=all_emb,
transpose_y=True) #[batch_size, all_vocab]
transpose_y=True) # [batch_size, all_vocab]
softmax = layers.softmax_with_cross_entropy(
logits=logits, label=self.label) #[batch_size, 1]
logits=logits, label=self.label) # [batch_size, 1]
self.loss = layers.reduce_mean(softmax) # [1]
self.acc = layers.accuracy(input=logits, label=self.label, k=20)
......
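The shape comments above trace a masked soft-attention readout: sigmoid(seq_fc + last_fc) is projected to one scalar weight per position, masked, summed against the sequence states, and concatenated with the last-click state. A numpy sketch with stand-in weights (all shapes assumed):

# Numpy sketch of the attention readout (batch b, seq_max t, hidden h).
import numpy as np

b, t, h = 2, 5, 8
seq = np.random.rand(b, t, h)                    # per-position states
last = np.random.rand(b, h)                      # last-click state
mask = (np.arange(t)[None, :, None] < 3).astype(float)  # valid positions

w = np.random.rand(h, 1)                         # stand-in for the weight fc
add_sigmoid = 1.0 / (1.0 + np.exp(-(seq + last[:, None, :])))
weight = (add_sigmoid @ w) * mask                # [b, t, 1], padded spots zeroed
global_attention = (seq * weight).sum(axis=1)    # [b, h]
final_attention = np.concatenate([global_attention, last], axis=1)  # [b, 2h]
print(final_attention.shape)  # (2, 16)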
......@@ -122,8 +122,9 @@ class TrainReader(Reader):
else:
# Due to fixed batch_size, discard the remaining ins
return
#cur_batch = remain_data[i:]
#yield self.make_data(cur_batch, group_remain % batch_size)
# cur_batch = remain_data[i:]
# yield self.make_data(cur_batch, group_remain % batch_size)
return _reader
def generate_batch_from_trainfiles(self, files):
......@@ -134,4 +135,5 @@ class TrainReader(Reader):
def generate_sample(self, line):
def data_iter():
yield []
return data_iter
......@@ -86,10 +86,8 @@ class Model(ModelBase):
self._metrics["cost"] = avg_cost
self._metrics["acc"] = acc
def train_net(self):
self.all_vocab_network()
def infer_net(self):
self.all_vocab_network(is_infer=True)
......@@ -51,6 +51,7 @@ class GrnnEncoder(object):
bias_attr=self.param_name + ".bias")
return fluid.layers.sequence_pool(input=gru_h, pool_type='max')
class PairwiseHingeLoss(object):
def __init__(self, margin=0.8):
self.margin = margin
......@@ -67,6 +68,7 @@ class PairwiseHingeLoss(object):
loss_part2)
return loss_part3
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
......@@ -77,7 +79,6 @@ class Model(ModelBase):
return correct
def train(self):
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace)
emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None, self._namespace)
hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace)
......@@ -126,11 +127,9 @@ class Model(ModelBase):
self._metrics["correct"] = correct
self._metrics["hinge_loss"] = hinge_loss
def train_net(self):
self.train()
def infer(self):
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace)
emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None, self._namespace)
......@@ -170,6 +169,5 @@ class Model(ModelBase):
self._infer_results['recall20'] = acc
def infer_net(self):
self.infer()
......@@ -20,12 +20,10 @@ from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class EvaluateReader(Reader):
def init(self):
self.vocab_size = envs.get_global_env("vocab_size", 10, "train.model.hyper_parameters")
def generate_sample(self, line):
"""
Read the data line by line and process it as a dictionary
......@@ -41,6 +39,6 @@ class EvaluateReader(Reader):
src = conv_ids[:boundary]
pos_tgt = [conv_ids[boundary]]
feature_name = ["user", "all_item", "p_item"]
yield zip(feature_name, [src] + [np.arange(self.vocab_size).astype("int64").tolist()]+ [pos_tgt])
yield zip(feature_name, [src] + [np.arange(self.vocab_size).astype("int64").tolist()] + [pos_tgt])
return reader
......@@ -19,7 +19,6 @@ import random
from paddlerec.core.reader import Reader
class TrainReader(Reader):
def init(self):
pass
......@@ -27,7 +26,6 @@ class TrainReader(Reader):
def sample_neg_from_seq(self, seq):
return seq[random.randint(0, len(seq) - 1)]
def generate_sample(self, line):
"""
Read the data line by line and process it as a dictionary
......
......@@ -20,11 +20,8 @@ import random
import re
import six
import argparse
prog = re.compile("[^a-z ]", flags=0)
......@@ -78,7 +75,7 @@ def parse_args():
def text_strip(text):
#English Preprocess Rule
# English Preprocess Rule
return prog.sub("", text.lower())
......@@ -120,7 +117,7 @@ def filter_corpus(args):
word_all_count = 0
id_counts = []
word_id = 0
#read dict
# read dict
with io.open(args.dict_path, 'r', encoding='utf-8') as f:
for line in f:
word, count = line.split()[0], int(line.split()[1])
......@@ -130,13 +127,13 @@ def filter_corpus(args):
id_counts.append(count)
word_all_count += count
#write word2id file
# write word2id file
print("write word2id file to : " + args.dict_path + "_word_to_id_")
with io.open(
args.dict_path + "_word_to_id_", 'w+', encoding='utf-8') as fid:
for k, v in word_to_id_.items():
fid.write(k + " " + str(v) + '\n')
#filter corpus and convert id
# filter corpus and convert id
if not os.path.exists(args.output_corpus_dir):
os.makedirs(args.output_corpus_dir)
for file in os.listdir(args.input_corpus_dir):
......@@ -205,7 +202,7 @@ def build_dict(args):
for item in item_to_remove:
unk_sum += word_count[item]
del word_count[item]
#sort by count
# sort by count
word_count[native_to_unicode('<UNK>')] = unk_sum
word_count = sorted(
word_count.items(), key=lambda word_count: -word_count[1])
......@@ -233,12 +230,13 @@ def data_split(args):
print("contents: ", str(len(contents)))
print("lines_per_file: ", str(lines_per_file))
for i in range(1, num+1):
for i in range(1, num + 1):
with open(os.path.join(new_data_dir, "part_" + str(i)), 'w') as fout:
data = contents[(i-1)*lines_per_file:min(i*lines_per_file,len(contents))]
data = contents[(i - 1) * lines_per_file:min(i * lines_per_file, len(contents))]
for line in data:
fout.write(line)
if __name__ == "__main__":
args = parse_args()
if args.build_dict:
......
......@@ -49,18 +49,15 @@ class EvaluateReader(Reader):
return True
return False
def _to_unicode(self, s, ignore_errors=False):
if self._is_unicode(s):
return s
error_mode = "ignore" if ignore_errors else "strict"
return s.decode("utf-8", errors=error_mode)
def strip_lines(self, line, vocab):
return self._replace_oov(vocab, self.native_to_unicode(line))
def _replace_oov(self, original_vocab, line):
"""Replace out-of-vocab words with "<UNK>".
This maintains compatibility with published results.
......@@ -78,5 +75,7 @@ class EvaluateReader(Reader):
def reader():
features = self.strip_lines(line.lower(), self.word_to_id)
features = features.split()
yield [('analogy_a', [self.word_to_id[features[0]]]), ('analogy_b', [self.word_to_id[features[1]]]), ('analogy_c', [self.word_to_id[features[2]]]), ('analogy_d', [self.word_to_id[features[3]]])]
yield [('analogy_a', [self.word_to_id[features[0]]]), ('analogy_b', [self.word_to_id[features[1]]]),
('analogy_c', [self.word_to_id[features[2]]]), ('analogy_d', [self.word_to_id[features[3]]])]
return reader
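These four ids feed the classic word-analogy evaluation: find d whose embedding is closest to emb(b) - emb(a) + emb(c) (e.g. king - man + woman ~ queen). A sketch against a hypothetical embedding matrix:

# Analogy lookup sketch; `emb` is a hypothetical [vocab, dim] matrix.
import numpy as np

def solve_analogy(emb, a, b, c):
    target = emb[b] - emb[a] + emb[c]
    target = target / np.linalg.norm(target)
    normed = emb / np.linalg.norm(emb, axis=1, keepdims=True)
    scores = normed @ target          # cosine similarity to every word
    for idx in (a, b, c):             # exclude the query words themselves
        scores[idx] = -np.inf
    return int(np.argmax(scores))

emb = np.random.rand(100, 16)
print(solve_analogy(emb, 0, 1, 2))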
......@@ -87,7 +87,7 @@ class TrainReader(Reader):
output = [('input_word', [int(target_id)]), ('true_label', [int(context_id)])]
if not self.with_shuffle_batch:
neg_array = self.cs.searchsorted(np.random.sample(self.neg_num))
output += [('neg_label', [int(str(i)) for i in neg_array ])]
output += [('neg_label', [int(str(i)) for i in neg_array])]
yield output
return reader
return reader
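self.cs.searchsorted(np.random.sample(self.neg_num)) draws negatives by inverse-CDF sampling: cs holds the cumulative word-frequency distribution, so uniform draws in [0, 1) map to word ids in proportion to their (possibly smoothed) frequency. A toy illustration:

# Inverse-CDF negative sampling over a toy unigram distribution.
import numpy as np

freqs = np.array([5.0, 1.0, 3.0, 1.0])     # toy word counts
cs = (freqs / freqs.sum()).cumsum()        # cumulative distribution
neg_array = cs.searchsorted(np.random.sample(5))
print(neg_array)  # e.g. [0 2 0 3 0] -- frequent ids are drawn more often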
......@@ -229,7 +229,7 @@ class Model(ModelBase):
act=self.act,
param_attr=fluid.ParamAttr(
name="trans.layer_fc.weight." + str(i)),
bias_attr=fluid.ParamAttr(name="trans.layer_fc.bias."+str(i)),
bias_attr=fluid.ParamAttr(name="trans.layer_fc.bias." + str(i)),
) for i in range(self.max_layers)
]
......@@ -268,8 +268,8 @@ class Model(ModelBase):
num_flatten_dims=2,
act=self.act,
param_attr=fluid.ParamAttr(
name="cls.concat_fc.weight."+str(i)),
bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias."+str(i))
name="cls.concat_fc.weight." + str(i)),
bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(i))
) for i in range(self.max_layers)
]
......@@ -458,7 +458,7 @@ class Model(ModelBase):
param_attr=fluid.ParamAttr(
name="trans.layer_fc.weight." + str(layer_idx)),
bias_attr=fluid.ParamAttr(
name="trans.layer_fc.bias."+str(layer_idx)),
name="trans.layer_fc.bias." + str(layer_idx)),
)
return input_layer_fc_out
......@@ -479,6 +479,6 @@ class Model(ModelBase):
num_flatten_dims=2,
act=self.act,
param_attr=fluid.ParamAttr(
name="cls.concat_fc.weight."+str(layer_idx)),
bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias."+str(layer_idx)))
name="cls.concat_fc.weight." + str(layer_idx)),
bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(layer_idx)))
return hidden_states_fc
......@@ -28,6 +28,7 @@ class EvaluateReader(Reader):
"""
Read the data line by line and process it as a dictionary
"""
def reader():
"""
This function needs to be implemented by the user, based on data format
......
......@@ -28,6 +28,7 @@ class TrainReader(Reader):
"""
Read the data line by line and process it as a dictionary
"""
def reader():
"""
This function needs to be implemented by the user, based on data format
......
......@@ -36,7 +36,7 @@ about["__author__"] = "paddle-dev"
about["__author_email__"] = "paddle-dev@baidu.com"
about["__url__"] = "https://github.com/PaddlePaddle/PaddleRec"
readme = "..."
readme = ""
def run_cmd(command):
......