From 40c2beda6f53b8859a3d634e1075c4cb36ff1a46 Mon Sep 17 00:00:00 2001
From: malin10
Date: Mon, 1 Jun 2020 15:21:21 +0800
Subject: [PATCH] fix_code_style

---
 models/match/dssm/config.yaml                 |   1 +
 models/match/dssm/model.py                    |  25 +-
 models/match/dssm/readme.md                   |  52 ++++
 .../match/dssm/synthetic_evaluate_reader.py   |   2 +-
 models/match/multiview-simnet/model.py        |  56 ++--
 models/match/multiview-simnet/readme.md       |  55 ++++
 models/recall/fasttext/evaluate_reader.py     |  16 +-
 models/recall/fasttext/model.py               | 104 +++----
 models/recall/fasttext/preprocess.py          |  16 +-
 models/recall/fasttext/reader.py              |   8 +-
 models/recall/gnn/evaluate_reader.py          |   3 +-
 models/recall/gnn/model.py                    |  46 +--
 models/recall/gnn/reader.py                   |   3 +-
 models/recall/word2vec/config.yaml            | 105 ++++---
 models/recall/word2vec/model.py               | 265 +++++++-----------
 models/recall/word2vec/w2v_evaluate_reader.py |   6 +-
 models/recall/word2vec/w2v_reader.py          |  12 +-
 17 files changed, 430 insertions(+), 345 deletions(-)
 create mode 100644 models/match/dssm/readme.md
 create mode 100644 models/match/multiview-simnet/readme.md

diff --git a/models/match/dssm/config.yaml b/models/match/dssm/config.yaml
index 6ef08d9b..67492fb8 100755
--- a/models/match/dssm/config.yaml
+++ b/models/match/dssm/config.yaml
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+
 workspace: "paddlerec.models.match.dssm"
 
 dataset:
diff --git a/models/match/dssm/model.py b/models/match/dssm/model.py
index fed0d692..23b31efc 100755
--- a/models/match/dssm/model.py
+++ b/models/match/dssm/model.py
@@ -27,17 +27,21 @@ class Model(ModelBase):
         self.Neg = envs.get_global_env("hyper_parameters.NEG")
         self.hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes")
         self.hidden_acts = envs.get_global_env("hyper_parameters.fc_acts")
-        self.learning_rate = envs.get_global_env("hyper_parameters.learning_rate")
+        self.learning_rate = envs.get_global_env(
+            "hyper_parameters.learning_rate")
 
     def input_data(self, is_infer=False, **kwargs):
         query = fluid.data(
-            name="query", shape=[-1, self.TRIGRAM_D], dtype='float32', lod_level=0)
+            name="query",
+            shape=[-1, self.TRIGRAM_D],
+            dtype='float32',
+            lod_level=0)
         doc_pos = fluid.data(
             name="doc_pos",
             shape=[-1, self.TRIGRAM_D],
             dtype='float32',
             lod_level=0)
-        
+
         if is_infer:
             return [query, doc_pos]
 
@@ -78,14 +82,14 @@ class Model(ModelBase):
             return
 
         R_Q_D_ns = []
-        for i in range(len(inputs)-2):
-            doc_neg_fc_i = fc(inputs[i+2], self.hidden_layers, self.hidden_acts, [
-                'doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i),
-                'doc_neg_l3_' + str(i)
-            ])
+        for i in range(len(inputs) - 2):
+            doc_neg_fc_i = fc(
+                inputs[i + 2], self.hidden_layers, self.hidden_acts, [
+                    'doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i),
+                    'doc_neg_l3_' + str(i)
+                ])
             R_Q_D_ns.append(fluid.layers.cos_sim(query_fc, doc_neg_fc_i))
-        concat_Rs = fluid.layers.concat(
-            input=[R_Q_D_p] + R_Q_D_ns, axis=-1)
+        concat_Rs = fluid.layers.concat(input=[R_Q_D_p] + R_Q_D_ns, axis=-1)
         prob = fluid.layers.softmax(concat_Rs, axis=1)
 
         hit_prob = fluid.layers.slice(
@@ -94,4 +98,3 @@ class Model(ModelBase):
         avg_cost = fluid.layers.mean(x=loss)
         self._cost = avg_cost
         self._metrics["LOSS"] = avg_cost
-
diff --git a/models/match/dssm/readme.md b/models/match/dssm/readme.md
new file mode 100644
index 00000000..db6e43a7
--- /dev/null
+++ b/models/match/dssm/readme.md
@@ -0,0 +1,52 @@
+# DSSM
+
+## Introduction
+
+DSSM ([Learning Deep Structured Semantic Models for Web Search using Clickthrough Data](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cikm2013_DSSM_fullversion.pdf)) is a semantic model built on a deep network. Its core idea is to map the query and the doc into a semantic space of common dimension, and to train a latent semantic model by maximizing the cosine similarity between the query and doc semantic vectors, so that retrieval can be carried out in that space; word hashing is used to reduce the dimensionality of the input vectors. DSSM is very widely applied, for example in search-engine retrieval, ad relevance, question answering, and machine translation.
+
+This project implements the network structure from the paper in PaddlePaddle, and builds a synthetic dataset to verify that the network is correct.
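+
+To make the training objective concrete, here is a minimal sketch of the cosine-similarity / softmax-over-candidates loss, written in the spirit of model.py (the tower outputs and function name here are illustrative, not part of the shipped code):
+
+```python
+import paddle.fluid as fluid
+
+def dssm_loss(query_fc, doc_pos_fc, doc_neg_fcs):
+    # query_fc / doc_*_fc: [batch, 128] outputs of the query and doc towers
+    sims = [fluid.layers.cos_sim(query_fc, doc_pos_fc)]
+    sims += [fluid.layers.cos_sim(query_fc, neg) for neg in doc_neg_fcs]
+    # softmax over 1 + NEG candidates; the clicked doc sits in column 0
+    prob = fluid.layers.softmax(fluid.layers.concat(sims, axis=1), axis=1)
+    hit_prob = fluid.layers.slice(prob, axes=[1], starts=[0], ends=[1])
+    # maximize the probability assigned to the clicked doc
+    return fluid.layers.mean(x=-fluid.layers.log(hit_prob))
+```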
+
+## Hyperparameters
+```
+optimizer:
+  class: sgd # optimizer
+  learning_rate: 0.01 # learning rate
+  strategy: async # parameter update mode
+TRIGRAM_D: 1000 # length of the query and doc semantic vectors
+NEG: 4 # number of negative samples
+fc_sizes: [300, 300, 128] # fc layer sizes
+fc_acts: ['tanh', 'tanh', 'tanh'] # fc layer activations
+
+```
+
+## Quick start
+PaddleRec ships a small demo dataset so the model can be tried out quickly. Train with:
+```bash
+python -m paddlerec.run -m paddlerec.models.match.dssm
+```
+
+Before running inference, change the following settings in config.yaml:
+```
+workspace: "~/code/paddlerec/models/match/dssm" # change to the absolute path of this config.yaml
+
+#mode: runner1 # train
+mode: runner2 # infer
+
+runner:
+- name: runner2
+  class: single_infer
+  init_model_path: "increment/2" # change to the path of the model to evaluate
+
+phase:
+- name: phase1
+  model: "{workspace}/model.py"
+  dataset_name: dataset_infer # switch to the inference dataset
+  thread_num: 1 # number of dataset threads
+```
+After these changes, run inference with:
+```
+python -m paddlerec.run -m ./config.yaml
+```
+
+## Testing notes
+The dataset currently used by the DSSM model is randomly generated, so for acceptance testing it is enough to run through the demo dataset following the steps above.
diff --git a/models/match/dssm/synthetic_evaluate_reader.py b/models/match/dssm/synthetic_evaluate_reader.py
index 97f50abf..5ee894fd 100755
--- a/models/match/dssm/synthetic_evaluate_reader.py
+++ b/models/match/dssm/synthetic_evaluate_reader.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 from paddlerec.core.reader import Reader
 
 
-class EvaluateReader(Reader):
+class TrainReader(Reader):
     def init(self):
         pass
 
diff --git a/models/match/multiview-simnet/model.py b/models/match/multiview-simnet/model.py
index 6eecb0bd..608446f9 100755
--- a/models/match/multiview-simnet/model.py
+++ b/models/match/multiview-simnet/model.py
@@ -101,12 +101,17 @@ class Model(ModelBase):
         ModelBase.__init__(self, config)
 
     def _init_hyper_parameters(self):
-        self.query_encoder = envs.get_global_env("hyper_parameters.query_encoder")
-        self.title_encoder = envs.get_global_env("hyper_parameters.title_encoder")
-        self.query_encode_dim = envs.get_global_env("hyper_parameters.query_encode_dim")
-        self.title_encode_dim = envs.get_global_env("hyper_parameters.title_encode_dim")
-
-        self.emb_size = envs.get_global_env("hyper_parameters.sparse_feature_dim")
+        self.query_encoder = envs.get_global_env(
+            "hyper_parameters.query_encoder")
+        self.title_encoder = envs.get_global_env(
+            "hyper_parameters.title_encoder")
+        self.query_encode_dim = envs.get_global_env(
+            "hyper_parameters.query_encode_dim")
+        self.title_encode_dim = envs.get_global_env(
+            "hyper_parameters.title_encode_dim")
+
+        self.emb_size = envs.get_global_env(
+            "hyper_parameters.sparse_feature_dim")
         self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim")
         self.emb_shape = [self.emb_size, self.emb_dim]
 
@@ -125,62 +130,61 @@ class Model(ModelBase):
             input=query, size=self.emb_shape, param_attr="emb")
             for query in self.q_slots
         ]
-        # encode each embedding field with encoder 
+        # encode each embedding field with encoder
         q_encodes = [
             self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
         ]
-        # concat multi view for query, pos_title, neg_title 
-        q_concat = fluid.layers.concat(q_encodes) 
+        # concat multi view for query, pos_title, neg_title
+        q_concat = fluid.layers.concat(q_encodes)
         # projection of hidden layer
-        q_hid = fluid.layers.fc(q_concat, 
+        q_hid = fluid.layers.fc(q_concat,
                                 size=self.hidden_size,
                                 param_attr='q_fc.w',
                                 bias_attr='q_fc.b')
 
         self.pt_slots = self._sparse_data_var[1:2]
         self.title_encoders = [
             factory.create(self.title_encoder, self.title_encode_dim)
         ]
-        pt_embs = [ 
+        pt_embs = [
             fluid.embedding(
                 input=title, size=self.emb_shape, param_attr="emb")
             for title in self.pt_slots
         ]
-        pt_encodes = [ 
+        pt_encodes = [
             self.title_encoders[i].forward(emb)
             for i, emb in enumerate(pt_embs)
         ]
-        pt_concat = fluid.layers.concat(pt_encodes) 
-        pt_hid = fluid.layers.fc(pt_concat, 
+        pt_concat = fluid.layers.concat(pt_encodes)
+        pt_hid = fluid.layers.fc(pt_concat,
                                 size=self.hidden_size,
                                 param_attr='t_fc.w',
                                 bias_attr='t_fc.b')
-        # cosine of hidden layers 
-        cos_pos = fluid.layers.cos_sim(q_hid, pt_hid) 
+        # cosine of hidden layers
+        cos_pos = fluid.layers.cos_sim(q_hid, pt_hid)
 
         if is_infer:
-            self._infer_results['query_pt_sim'] = cos_pos 
-            return 
+            self._infer_results['query_pt_sim'] = cos_pos
+            return
 
         self.nt_slots = self._sparse_data_var[2:3]
-        nt_embs = [ 
+        nt_embs = [
             fluid.embedding(
                 input=title, size=self.emb_shape, param_attr="emb")
             for title in self.nt_slots
         ]
-        nt_encodes = [ 
+        nt_encodes = [
             self.title_encoders[i].forward(emb)
             for i, emb in enumerate(nt_embs)
         ]
-        nt_concat = fluid.layers.concat(nt_encodes) 
-        nt_hid = fluid.layers.fc(nt_concat, 
+        nt_concat = fluid.layers.concat(nt_encodes)
+        nt_hid = fluid.layers.fc(nt_concat,
                                 size=self.hidden_size,
                                 param_attr='t_fc.w',
                                 bias_attr='t_fc.b')
-        cos_neg = fluid.layers.cos_sim(q_hid, nt_hid) 
+        cos_neg = fluid.layers.cos_sim(q_hid, nt_hid)
 
-        # pairwise hinge_loss 
+        # pairwise hinge_loss
         loss_part1 = fluid.layers.elementwise_sub(
             tensor.fill_constant_batch_size_like(
                 input=cos_pos,
@@ -198,7 +202,7 @@ class Model(ModelBase):
         self._cost = fluid.layers.mean(loss_part3)
         self.acc = self.get_acc(cos_neg, cos_pos)
-        self._metrics["loss"] = self._cost 
+        self._metrics["loss"] = self._cost
         self._metrics["acc"] = self.acc
 
     def get_acc(self, x, y):
diff --git a/models/match/multiview-simnet/readme.md b/models/match/multiview-simnet/readme.md
new file mode 100644
index 00000000..cc09c2f2
--- /dev/null
+++ b/models/match/multiview-simnet/readme.md
@@ -0,0 +1,55 @@
+# Multi-view Simnet for Personalized Recommendation
+
+## Introduction
+
+In personalized recommendation scenarios, the list of items offered to a user is usually computed by a personalized matching model. In the real world, a user has features from many views, such as the user id, age, and the click history of items; an item, for example a news article, likewise has features from several views, such as the news title and category. Multi-view Simnet is a unified model that fuses the multi-view features of users and of recommended items and learns personalized matching between them. Models of this kind are used in many industrial settings, for example in Baidu's Feed products.
+
+The goal of this project is to provide a model for personalized matching built with Paddle. Multi-view Simnet consists of several encoder modules, one used on each feature view. The project currently provides a Bag-of-Embedding encoder, a Temporal-Convolutional encoder, and a Gated-Recurrent-Unit encoder; encoders that are practical in sparse-feature scenarios will gradually be added. Training currently follows the pairwise ranking scheme: for each related User-Item pair, a random item is drawn as a negative example, and the model learns to rank the true pair above it.
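+
+To make the ranking objective concrete, here is a minimal sketch of the pairwise max-margin (hinge) loss over the two cosine scores; model.py assembles the same quantity from fill_constant_batch_size_like and elementwise ops, so this is illustrative rather than the literal implementation:
+
+```python
+import paddle.fluid as fluid
+
+def pairwise_hinge_loss(cos_pos, cos_neg, margin=0.1):
+    # loss = mean(max(0, margin - cos_pos + cos_neg)); cos_* are [batch, 1]
+    gap = fluid.layers.elementwise_add(
+        fluid.layers.elementwise_sub(cos_neg, cos_pos),
+        fluid.layers.fill_constant_batch_size_like(
+            input=cos_pos, shape=[-1, 1], value=margin, dtype='float32'))
+    return fluid.layers.mean(fluid.layers.relu(gap))
+```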
+
+## Hyperparameters
+```
+optimizer:
+  class: Adam # optimizer type
+  learning_rate: 0.0001 # learning rate
+  strategy: async # parameter update mode
+query_encoder: "bow" # encoder for user features
+title_encoder: "bow" # encoder for item features
+query_encode_dim: 128 # output dimension of the user encoder
+title_encode_dim: 128 # output dimension of the item encoder
+sparse_feature_dim: 1000001 # total number of user and item features
+embedding_dim: 128 # embedding dimension
+hidden_size: 128 # hidden layer dimension
+margin: 0.1 # max margin for hinge-loss
+```
+
+## Quick start
+PaddleRec ships a small demo dataset so the model can be tried out quickly. Train with:
+```bash
+python -m paddlerec.run -m paddlerec.models.match.multiview-simnet
+```
+
+Before running inference, change the following settings in config.yaml:
+```
+workspace: "~/code/paddlerec/models/match/multiview-simnet" # change to the absolute path of this config.yaml
+
+#mode: runner1 # train
+mode: runner2 # infer
+
+runner:
+- name: runner2
+  class: single_infer
+  init_model_path: "increment/2" # change to the path of the model to evaluate
+
+phase:
+- name: phase1
+  model: "{workspace}/model.py"
+  dataset_name: dataset_infer # switch to the inference dataset
+  thread_num: 1 # number of dataset threads
+```
+After these changes, run inference with:
+```
+python -m paddlerec.run -m ./config.yaml
+```
+
+## Testing notes
+The dataset currently used by the Multi-view Simnet model is randomly generated, so for acceptance testing it is enough to run through the demo dataset following the steps above.
diff --git a/models/recall/fasttext/evaluate_reader.py b/models/recall/fasttext/evaluate_reader.py
index d4357ab6..d51c8606 100755
--- a/models/recall/fasttext/evaluate_reader.py
+++ b/models/recall/fasttext/evaluate_reader.py
@@ -22,9 +22,10 @@ from paddlerec.core.utils import envs
 
 class TrainReader(Reader):
     def init(self):
-        dict_path = envs.get_global_env("dataset.dataset_infer.word_id_dict_path")
-        self.min_n = envs.get_global_env("hyper_parameters.min_n") 
-        self.max_n = envs.get_global_env("hyper_parameters.max_n") 
+        dict_path = envs.get_global_env(
+            "dataset.dataset_infer.word_id_dict_path")
+        self.min_n = envs.get_global_env("hyper_parameters.min_n")
+        self.max_n = envs.get_global_env("hyper_parameters.max_n")
         self.word_to_id = dict()
         self.id_to_word = dict()
         with io.open(dict_path, 'r', encoding='utf-8') as f:
@@ -78,7 +79,8 @@ class TrainReader(Reader):
         a unicode string - a space-delimited sequence of words.
         """
         return u" ".join([
-            "<" + word + ">" if "<" + word + ">" in original_vocab else u"" 
+            "<" + word + ">"
+            if "<" + word + ">" in original_vocab else u""
             for word in line.split()
         ])
 
@@ -99,9 +101,7 @@ class TrainReader(Reader):
                         res.append(self.word_to_id[_])
                     inputs.append(res)
                 print(inputs)
-                yield [('analogy_a', inputs[0]),
-                       ('analogy_b', inputs[1]),
-                       ('analogy_c', inputs[2]),
-                       ('analogy_d', inputs[3][0:1])]
+                yield [('analogy_a', inputs[0]), ('analogy_b', inputs[1]),
+                       ('analogy_c', inputs[2]), ('analogy_d', inputs[3][0:1])]
 
         return reader
diff --git a/models/recall/fasttext/model.py b/models/recall/fasttext/model.py
index 6c9fe3c1..f6415d80 100755
--- a/models/recall/fasttext/model.py
+++ b/models/recall/fasttext/model.py
@@ -24,27 +24,33 @@ class Model(ModelBase):
         ModelBase.__init__(self, config)
 
     def _init_hyper_parameters(self):
-        self.is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
-        self.sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number")
-        self.sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim")
+        self.is_distributed = True if envs.get_trainer(
+        ) == "CtrTrainer" else False
+        self.sparse_feature_number = envs.get_global_env(
+            "hyper_parameters.sparse_feature_number")
+        self.sparse_feature_dim = envs.get_global_env(
+            "hyper_parameters.sparse_feature_dim")
         self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
-        self.with_shuffle_batch = envs.get_global_env("hyper_parameters.with_shuffle_batch")
-        self.learning_rate = envs.get_global_env("hyper_parameters.optimizer.learning_rate")
-        self.decay_steps = envs.get_global_env("hyper_parameters.optimizer.decay_steps")
-        self.decay_rate = envs.get_global_env("hyper_parameters.optimizer.decay_rate")
-
+        self.with_shuffle_batch = envs.get_global_env(
+            "hyper_parameters.with_shuffle_batch")
+        self.learning_rate = envs.get_global_env(
+            "hyper_parameters.optimizer.learning_rate")
+        self.decay_steps = envs.get_global_env(
+            "hyper_parameters.optimizer.decay_steps")
+        self.decay_rate = envs.get_global_env(
+            "hyper_parameters.optimizer.decay_rate")
 
     def input_data(self, is_infer=False, **kwargs):
         if is_infer:
             analogy_a = fluid.data(
-                name="analogy_a", shape=[None, 1], lod_level=1, dtype='int64') 
+                name="analogy_a", shape=[None, 1], lod_level=1, dtype='int64')
             analogy_b = fluid.data(
                 name="analogy_b", shape=[None, 1], lod_level=1, dtype='int64')
             analogy_c = fluid.data(
                 name="analogy_c", shape=[None, 1], lod_level=1, dtype='int64')
             analogy_d = fluid.data(
                 name="analogy_d", shape=[None, 1], dtype='int64')
-            return [analogy_a, analogy_b, analogy_c, analogy_d] 
+            return [analogy_a, analogy_b, analogy_c, analogy_d]
 
         input_word = fluid.data(
             name="input_word", shape=[None, 1], lod_level=1, dtype='int64')
@@ -59,10 +65,10 @@ class Model(ModelBase):
 
     def net(self, inputs, is_infer=False):
         if is_infer:
-            self.infer_net(inputs) 
-            return 
+            self.infer_net(inputs)
+            return
 
-        def embedding_layer(input, 
+        def embedding_layer(input,
                             table_name,
                             initializer_instance=None,
                             sequence_pool=False):
@@ -74,7 +80,8 @@ class Model(ModelBase):
                 param_attr=fluid.ParamAttr(
                     name=table_name, initializer=initializer_instance), )
             if sequence_pool:
-                emb = fluid.layers.sequence_pool(input=emb, pool_type='average')
+                emb = fluid.layers.sequence_pool(
+                    input=emb, pool_type='average')
             return emb
 
         init_width = 1.0 / self.sparse_feature_dim
@@ -83,10 +90,10 @@ class Model(ModelBase):
 
         input_emb = embedding_layer(inputs[0], "emb", emb_initializer, True)
         input_emb = fluid.layers.squeeze(input=input_emb, axes=[1])
-        true_emb_w = embedding_layer(inputs[1], "emb_w", emb_w_initializer, True)
+        true_emb_w = embedding_layer(inputs[1], "emb_w", emb_w_initializer,
+                                     True)
         true_emb_w = fluid.layers.squeeze(input=true_emb_w, axes=[1])
 
-
         if self.with_shuffle_batch:
             neg_emb_w_list = []
             for i in range(self.neg_num):
@@ -95,7 +102,8 @@ class Model(ModelBase):
                         true_emb_w))  # shuffle true_word
             neg_emb_w_concat = fluid.layers.concat(neg_emb_w_list, axis=0)
             neg_emb_w = fluid.layers.reshape(
-                neg_emb_w_concat, shape=[-1, self.neg_num, self.sparse_feature_dim])
+                neg_emb_w_concat,
+                shape=[-1, self.neg_num, self.sparse_feature_dim])
         else:
             neg_emb_w = embedding_layer(inputs[2], "emb_w", emb_w_initializer)
         true_logits = fluid.layers.reduce_sum(
@@ -107,8 +115,7 @@ class Model(ModelBase):
             input_emb, shape=[-1, 1, self.sparse_feature_dim])
         neg_matmul = fluid.layers.matmul(
             input_emb_re, neg_emb_w, transpose_y=True)
-        neg_logits = fluid.layers.reshape(
-            neg_matmul, shape=[-1, 1])
+        neg_logits = fluid.layers.reshape(neg_matmul, shape=[-1, 1])
 
         logits = fluid.layers.concat([true_logits, neg_logits], axis=0)
         label_ones = fluid.layers.fill_constant(
@@ -120,13 +127,12 @@ class Model(ModelBase):
             value=0.0,
             dtype='float32')
         label = fluid.layers.concat([label_ones, label_zeros], axis=0)
-        
+
         loss = fluid.layers.log_loss(fluid.layers.sigmoid(logits), label)
         avg_cost = fluid.layers.reduce_sum(loss)
 
         self._cost = avg_cost
         self._metrics["LOSS"] = avg_cost
 
-
     def optimizer(self):
         optimizer = fluid.optimizer.SGD(
             learning_rate=fluid.layers.exponential_decay(
@@ -137,13 +143,17 @@ class Model(ModelBase):
         return optimizer
 
     def infer_net(self, inputs):
-        def embedding_layer(input, table_name, initializer_instance=None, sequence_pool=False):
+        def embedding_layer(input,
+                            table_name,
+                            initializer_instance=None,
+                            sequence_pool=False):
             emb = fluid.embedding(
                 input=input,
                 size=[self.sparse_feature_number, self.sparse_feature_dim],
                 param_attr=table_name)
             if sequence_pool:
-                emb = fluid.layers.sequence_pool(input=emb, pool_type='average')
+                emb = fluid.layers.sequence_pool(
+                    input=emb, pool_type='average')
             return emb
 
         all_label = np.arange(self.sparse_feature_number).reshape(
@@ -166,36 +176,34 @@ class Model(ModelBase):
         dist = fluid.layers.matmul(
             x=target, y=emb_all_label_l2, transpose_y=True)
         values, pred_idx = fluid.layers.topk(input=dist, k=4)
-        label = fluid.layers.expand(
-            inputs[3],
-            expand_times=[1, 4])
+        label = fluid.layers.expand(inputs[3], expand_times=[1, 4])
         label_ones = fluid.layers.fill_constant_batch_size_like(
             label, shape=[-1, 1], value=1.0, dtype='float32')
 
         right_cnt = fluid.layers.reduce_sum(input=fluid.layers.cast(
             fluid.layers.equal(pred_idx, label), dtype='float32'))
         total_cnt = fluid.layers.reduce_sum(label_ones)
 
-        # global_right_cnt = fluid.layers.create_global_var( 
-        #      name="global_right_cnt",
-        #      persistable=True,
-        #      dtype='float32',
-        #      shape=[1],
-        #      value=0)
-        # global_total_cnt = fluid.layers.create_global_var( 
-        #      name="global_total_cnt",
-        #      persistable=True,
-        #      dtype='float32',
-        #      shape=[1],
-        #      value=0)
-        # global_right_cnt.stop_gradient = True 
-        # global_total_cnt.stop_gradient = True 
-
-        # tmp1 = fluid.layers.elementwise_add(right_cnt, global_right_cnt) 
-        # fluid.layers.assign(tmp1, global_right_cnt) 
-        # tmp2 = fluid.layers.elementwise_add(total_cnt, global_total_cnt) 
-        # fluid.layers.assign(tmp2, global_total_cnt) 
-
-        # acc = fluid.layers.elementwise_div( 
-        #      global_right_cnt, global_total_cnt, name="total_acc")
+        # global_right_cnt = fluid.layers.create_global_var(
+        #     name="global_right_cnt",
+        #     persistable=True,
+        #     dtype='float32',
+        #     shape=[1],
+        #     value=0)
+        # global_total_cnt = fluid.layers.create_global_var(
+        #     name="global_total_cnt",
+        #     persistable=True,
+        #     dtype='float32',
+        #     shape=[1],
+        #     value=0)
+        # global_right_cnt.stop_gradient = True
+        # global_total_cnt.stop_gradient = True
+
+        # tmp1 = fluid.layers.elementwise_add(right_cnt, global_right_cnt)
+        # fluid.layers.assign(tmp1, global_right_cnt)
+        # tmp2 = fluid.layers.elementwise_add(total_cnt, global_total_cnt)
+        # fluid.layers.assign(tmp2, global_total_cnt)
+
+        # acc = fluid.layers.elementwise_div(
+        #     global_right_cnt, global_total_cnt, name="total_acc")
         acc = fluid.layers.elementwise_div(right_cnt, total_cnt, name="acc")
         self._infer_results['acc'] = acc
 
diff --git a/models/recall/fasttext/preprocess.py b/models/recall/fasttext/preprocess.py
index 43d80971..95e48836 100755
--- a/models/recall/fasttext/preprocess.py
+++ b/models/recall/fasttext/preprocess.py
@@ -45,18 +45,8 @@ def parse_args():
         default=5,
         help="If the word count is less then min_count, it will be removed from dict"
     )
-    parser.add_argument(
-        '--min_n',
-        type=int,
-        default=3,
-        help="min_n of ngrams"
-    )
-    parser.add_argument(
-        '--max_n',
-        type=int,
-        default=5,
-        help="max_n of ngrams"
-    )
+    parser.add_argument('--min_n', type=int, default=3, help="min_n of ngrams")
+    parser.add_argument('--max_n', type=int, default=5, help="max_n of ngrams")
     parser.add_argument(
         '--file_nums',
         type=int,
@@ -201,6 +191,7 @@ def computeSubwords(word, min_n, max_n):
             ngrams.add("".join(word[i:end]))
     return list(ngrams)
 
+
 def build_dict(args):
     """
     proprocess the data, generate dictionary and save into dict_path.
@@ -267,6 +258,7 @@
             f.write(" ".join(word_ngrams[key]))
             f.write(u'\n')
 
+
 def data_split(args):
     raw_data_dir = args.input_corpus_dir
     new_data_dir = args.output_corpus_dir
diff --git a/models/recall/fasttext/reader.py b/models/recall/fasttext/reader.py
index a804ef90..a46e7a01 100755
--- a/models/recall/fasttext/reader.py
+++ b/models/recall/fasttext/reader.py
@@ -40,8 +40,10 @@ class NumpyRandomInt(object):
 
 class TrainReader(Reader):
     def init(self):
-        dict_path = envs.get_global_env("dataset.dataset_train.word_count_dict_path")
-        word_ngrams_path = envs.get_global_env("dataset.dataset_train.word_ngrams_path")
+        dict_path = envs.get_global_env(
+            "dataset.dataset_train.word_count_dict_path")
+        word_ngrams_path = envs.get_global_env(
+            "dataset.dataset_train.word_ngrams_path")
         self.window_size = envs.get_global_env("hyper_parameters.window_size")
         self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
         self.with_shuffle_batch = envs.get_global_env(
@@ -53,7 +55,7 @@ class TrainReader(Reader):
             for line in f:
                 line = line.rstrip().split()
                 self.word_ngrams[str(line[0])] = map(int, line[1:])
-        
+
         self.cs = None
         if not self.with_shuffle_batch:
             id_counts = []
diff --git a/models/recall/gnn/evaluate_reader.py b/models/recall/gnn/evaluate_reader.py
index 74299972..d7b24c96 100755
--- a/models/recall/gnn/evaluate_reader.py
+++ b/models/recall/gnn/evaluate_reader.py
@@ -23,7 +23,8 @@ from paddlerec.core.utils import envs
 
 class TrainReader(Reader):
     def init(self):
-        self.batch_size = envs.get_global_env("dataset.dataset_infer.batch_size")
+        self.batch_size = envs.get_global_env(
+            "dataset.dataset_infer.batch_size")
 
         self.input = []
         self.length = None
diff --git a/models/recall/gnn/model.py b/models/recall/gnn/model.py
index 055cea3a..6e7d2ab5 100755
--- a/models/recall/gnn/model.py
+++ b/models/recall/gnn/model.py
@@ -27,19 +27,27 @@ class Model(ModelBase):
         ModelBase.__init__(self, config)
 
     def _init_hyper_parameters(self):
-        self.learning_rate = envs.get_global_env("hyper_parameters.optimizer.learning_rate")
-        self.decay_steps = envs.get_global_env("hyper_parameters.optimizer.decay_steps")
-        self.decay_rate = envs.get_global_env("hyper_parameters.optimizer.decay_rate")
-        self.l2 = envs.get_global_env("hyper_parameters.optimizer.l2")
-
-        self.dict_size = envs.get_global_env("hyper_parameters.sparse_feature_nums")
+        self.learning_rate = envs.get_global_env(
+            "hyper_parameters.optimizer.learning_rate")
+        self.decay_steps = envs.get_global_env(
+            "hyper_parameters.optimizer.decay_steps")
+        self.decay_rate = envs.get_global_env(
+            "hyper_parameters.optimizer.decay_rate")
+        self.l2 = envs.get_global_env("hyper_parameters.optimizer.l2")
+
+        self.dict_size = envs.get_global_env(
+            "hyper_parameters.sparse_feature_nums")
         self.corpus_size = envs.get_global_env("hyper_parameters.corpus_size")
 
-        self.train_batch_size = envs.get_global_env("dataset.dataset_train.batch_size")
-        self.evaluate_batch_size = envs.get_global_env("dataset.dataset_infer.batch_size")
+        self.train_batch_size = envs.get_global_env(
+            "dataset.dataset_train.batch_size")
+        self.evaluate_batch_size = envs.get_global_env(
+            "dataset.dataset_infer.batch_size")
 
-        self.hidden_size = envs.get_global_env("hyper_parameters.sparse_feature_dim")
-        self.step = envs.get_global_env("hyper_parameters.gnn_propogation_steps")
+        self.hidden_size = envs.get_global_env(
+            "hyper_parameters.sparse_feature_dim")
+        self.step = envs.get_global_env(
+            "hyper_parameters.gnn_propogation_steps")
 
     def input_data(self, is_infer=False, **kwargs):
         if is_infer:
@@ -66,9 +74,7 @@ class Model(ModelBase):
         label = fluid.data(
             name="label", shape=[bs, 1], dtype="int64")  # [batch_size, 1]
 
-        res = [
-            items, seq_index, last_index, adj_in, adj_out, mask, label
-        ]
+        res = [items, seq_index, last_index, adj_in, adj_out, mask, label]
         return res
 
     def net(self, inputs, is_infer=False):
@@ -76,7 +82,7 @@ class Model(ModelBase):
             bs = self.evaluate_batch_size
         else:
             bs = self.train_batch_size
-        
+
         stdv = 1.0 / math.sqrt(self.hidden_size)
 
         def embedding_layer(input,
@@ -124,7 +130,8 @@ class Model(ModelBase):
             state_adj_in = layers.matmul(inputs[3],
                                          state_in)  # [batch_size, uniq_max, h]
-            state_adj_out = layers.matmul(inputs[4], state_out)  # [batch_size, uniq_max, h]
+            state_adj_out = layers.matmul(
+                inputs[4], state_out)  # [batch_size, uniq_max, h]
 
             gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)
 
@@ -140,7 +147,8 @@ class Model(ModelBase):
                     x=pre_state, shape=[-1, self.hidden_size]),
                 size=3 * self.hidden_size)
 
-        final_state = layers.reshape(pre_state, shape=[bs, -1, self.hidden_size])
+        final_state = layers.reshape(
+            pre_state, shape=[bs, -1, self.hidden_size])
         seq = layers.gather_nd(final_state, inputs[1])
         last = layers.gather_nd(final_state, inputs[2])
 
@@ -231,14 +239,13 @@ class Model(ModelBase):
         self._cost = self.loss
 
         if is_infer:
-            self._infer_results['acc'] = self.acc 
+            self._infer_results['acc'] = self.acc
             self._infer_results['loss'] = self.loss
             return
 
-        self._metrics["LOSS"] = self.loss 
+        self._metrics["LOSS"] = self.loss
         self._metrics["train_acc"] = self.acc
 
-
     def optimizer(self):
         step_per_epoch = self.corpus_size // self.train_batch_size
         optimizer = fluid.optimizer.Adam(
@@ -249,4 +256,3 @@ class Model(ModelBase):
             regularization=fluid.regularizer.L2DecayRegularizer(
                 regularization_coeff=self.l2))
         return optimizer
-
diff --git a/models/recall/gnn/reader.py b/models/recall/gnn/reader.py
index 7dfade76..fd54277d 100755
--- a/models/recall/gnn/reader.py
+++ b/models/recall/gnn/reader.py
@@ -23,7 +23,8 @@ from paddlerec.core.utils import envs
 
 class TrainReader(Reader):
     def init(self):
-        self.batch_size = envs.get_global_env("dataset.dataset_train.batch_size")
+        self.batch_size = envs.get_global_env(
+            "dataset.dataset_train.batch_size")
 
         self.input = []
         self.length = None
diff --git a/models/recall/word2vec/config.yaml b/models/recall/word2vec/config.yaml
index 9bb5c4d3..7a1452a7 100755
--- a/models/recall/word2vec/config.yaml
+++ b/models/recall/word2vec/config.yaml
@@ -11,51 +11,70 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-evaluate:
-  workspace: "paddlerec.models.recall.word2vec"
+workspace: "paddlerec.models.recall.word2vec"
 
-  evaluate_only: False
-  evaluate_model_path: ""
-
-  reader:
-    batch_size: 50
-    class: "{workspace}/w2v_evaluate_reader.py"
-    test_data_path: "{workspace}/data/test"
-    word_id_dict_path: "{workspace}/data/dict/word_id_dict.txt"
+# list of dataset
+dataset:
+- name: dataset_train # name of dataset to distinguish different datasets
+  batch_size: 100
+  type: DataLoader # or QueueDataset
+  data_path: "{workspace}/data/train"
+  word_count_dict_path: "{workspace}/data/dict/word_count_dict.txt"
+  data_converter: "{workspace}/w2v_reader.py"
+- name: dataset_infer # name
+  batch_size: 50
+  type: DataLoader # or QueueDataset
+  data_path: "{workspace}/data/test"
+  word_id_dict_path: "{workspace}/data/dict/word_id_dict.txt"
+  data_converter: "{workspace}/w2v_evaluate_reader.py"
 
-train:
-  trainer:
-    # for cluster training
-    strategy: "async"
+hyper_parameters:
+  optimizer:
+    learning_rate: 1.0
+    decay_steps: 100000
+    decay_rate: 0.999
+    class: sgd
+    strategy: async
+  sparse_feature_number: 85
+  sparse_feature_dim: 300
+  with_shuffle_batch: False
+  neg_num: 5
+  window_size: 5
 
+# select runner by name
+mode: runner1
+# config of each runner.
+# runner is a kind of paddle training class, which wraps the train/infer process.
+runner:
+- name: runner1
+  class: single_train
+  # num of epochs
   epochs: 2
-  workspace: "paddlerec.models.recall.word2vec"
+  # device to run training or infer
+  device: cpu
+  save_checkpoint_interval: 1 # save model interval of epochs
+  save_inference_interval: 1 # save inference
+  save_checkpoint_path: "increment" # save checkpoint path
+  save_inference_path: "inference" # save inference path
+  save_inference_feed_varnames: [] # feed vars of save inference
+  save_inference_fetch_varnames: [] # fetch vars of save inference
+  init_model_path: "" # load model path
+  fetch_period: 10
+- name: runner2
+  class: single_infer
+  # num of epochs
+  epochs: 1
+  # device to run training or infer
+  device: cpu
+  init_model_path: "increment/0" # load model path
 
-  reader:
-    batch_size: 100
-    class: "{workspace}/w2v_reader.py"
-    train_data_path: "{workspace}/data/train"
-    word_count_dict_path: "{workspace}/data/dict/word_count_dict.txt"
-
-  model:
-    models: "{workspace}/model.py"
-    hyper_parameters:
-      sparse_feature_number: 85
-      sparse_feature_dim: 300
-      with_shuffle_batch: False
-      neg_num: 5
-      window_size: 5
-      learning_rate: 1.0
-      decay_steps: 100000
-      decay_rate: 0.999
-      optimizer: sgd
-
-  save:
-    increment:
-      dirname: "increment"
-      epoch_interval: 1
-      save_last: True
-    inference:
-      dirname: "inference"
-      epoch_interval: 1
-      save_last: True
+# runner will run all the phase in each epoch
+phase:
+- name: phase1
+  model: "{workspace}/model.py" # user-defined model
+  dataset_name: dataset_train # select dataset by name
+  thread_num: 1
+#- name: phase2
+#  model: "{workspace}/model.py" # user-defined model
+#  dataset_name: dataset_infer # select dataset by name
+#  thread_num: 1
diff --git a/models/recall/word2vec/model.py b/models/recall/word2vec/model.py
index fefc8904..16dc94f4 100755
--- a/models/recall/word2vec/model.py
+++ b/models/recall/word2vec/model.py
"CtrTrainer" else False + self.sparse_feature_number = envs.get_global_env( + "hyper_parameters.sparse_feature_number") + self.sparse_feature_dim = envs.get_global_env( + "hyper_parameters.sparse_feature_dim") + self.neg_num = envs.get_global_env("hyper_parameters.neg_num") + self.with_shuffle_batch = envs.get_global_env( + "hyper_parameters.with_shuffle_batch") + self.learning_rate = envs.get_global_env( + "hyper_parameters.optimizer.learning_rate") + self.decay_steps = envs.get_global_env( + "hyper_parameters.optimizer.decay_steps") + self.decay_rate = envs.get_global_env( + "hyper_parameters.optimizer.decay_rate") + + def input_data(self, is_infer=False, **kwargs): + if is_infer: + analogy_a = fluid.data( + name="analogy_a", shape=[None], dtype='int64') + analogy_b = fluid.data( + name="analogy_b", shape=[None], dtype='int64') + analogy_c = fluid.data( + name="analogy_c", shape=[None], dtype='int64') + analogy_d = fluid.data( + name="analogy_d", shape=[None], dtype='int64') + return [analogy_a, analogy_b, analogy_c, analogy_d] + + input_word = fluid.data( name="input_word", shape=[None, 1], dtype='int64') - self.true_word = fluid.data( + true_word = fluid.data( name='true_label', shape=[None, 1], dtype='int64') - self._data_var.append(self.input_word) - self._data_var.append(self.true_word) - with_shuffle_batch = bool( - int( - envs.get_global_env("hyper_parameters.with_shuffle_batch", - None, self._namespace))) - if not with_shuffle_batch: - self.neg_word = fluid.data( - name="neg_label", shape=[None, neg_num], dtype='int64') - self._data_var.append(self.neg_word) + if self.with_shuffle_batch: + return [input_word, true_word] - if self._platform != "LINUX": - self._data_loader = fluid.io.DataLoader.from_generator( - feed_list=self._data_var, - capacity=64, - use_double_buffer=False, - iterable=False) + neg_word = fluid.data( + name="neg_label", shape=[None, self.neg_num], dtype='int64') + return [input_word, true_word, neg_word] - def net(self): - is_distributed = True if envs.get_trainer() == "CtrTrainer" else False - neg_num = int( - envs.get_global_env("hyper_parameters.neg_num", None, - self._namespace)) - sparse_feature_number = envs.get_global_env( - "hyper_parameters.sparse_feature_number", None, self._namespace) - sparse_feature_dim = envs.get_global_env( - "hyper_parameters.sparse_feature_dim", None, self._namespace) - with_shuffle_batch = bool( - int( - envs.get_global_env("hyper_parameters.with_shuffle_batch", - None, self._namespace))) + def net(self, inputs, is_infer=False): + if is_infer: + self.infer_net(inputs) + return def embedding_layer(input, table_name, - emb_dim, initializer_instance=None, squeeze=False): emb = fluid.embedding( input=input, is_sparse=True, - is_distributed=is_distributed, - size=[sparse_feature_number, emb_dim], + is_distributed=self.is_distributed, + size=[self.sparse_feature_number, self.sparse_feature_dim], param_attr=fluid.ParamAttr( name=table_name, initializer=initializer_instance), ) if squeeze: @@ -80,115 +84,60 @@ class Model(ModelBase): else: return emb - init_width = 0.5 / sparse_feature_dim + init_width = 1.0 / self.sparse_feature_dim emb_initializer = fluid.initializer.Uniform(-init_width, init_width) emb_w_initializer = fluid.initializer.Constant(value=0.0) - input_emb = embedding_layer(self.input_word, "emb", sparse_feature_dim, - emb_initializer, True) - true_emb_w = embedding_layer(self.true_word, "emb_w", - sparse_feature_dim, emb_w_initializer, + input_emb = embedding_layer(inputs[0], "emb", emb_initializer, True) + true_emb_w = 
+        true_emb_w = embedding_layer(inputs[1], "emb_w", emb_w_initializer,
                                      True)
-        true_emb_b = embedding_layer(self.true_word, "emb_b", 1,
-                                     emb_w_initializer, True)
 
-        if with_shuffle_batch:
+        if self.with_shuffle_batch:
             neg_emb_w_list = []
-            for i in range(neg_num):
+            for i in range(self.neg_num):
                 neg_emb_w_list.append(
                     fluid.contrib.layers.shuffle_batch(
                         true_emb_w))  # shuffle true_word
             neg_emb_w_concat = fluid.layers.concat(neg_emb_w_list, axis=0)
             neg_emb_w = fluid.layers.reshape(
-                neg_emb_w_concat, shape=[-1, neg_num, sparse_feature_dim])
-
-            neg_emb_b_list = []
-            for i in range(neg_num):
-                neg_emb_b_list.append(
-                    fluid.contrib.layers.shuffle_batch(
-                        true_emb_b))  # shuffle true_word
-            neg_emb_b = fluid.layers.concat(neg_emb_b_list, axis=0)
-            neg_emb_b_vec = fluid.layers.reshape(
-                neg_emb_b, shape=[-1, neg_num])
-
+                neg_emb_w_concat,
+                shape=[-1, self.neg_num, self.sparse_feature_dim])
         else:
-            neg_emb_w = embedding_layer(self.neg_word, "emb_w",
-                                        sparse_feature_dim, emb_w_initializer)
-            neg_emb_b = embedding_layer(self.neg_word, "emb_b", 1,
-                                        emb_w_initializer)
-            neg_emb_b_vec = fluid.layers.reshape(
-                neg_emb_b, shape=[-1, neg_num])
-
-        true_logits = fluid.layers.elementwise_add(
-            fluid.layers.reduce_sum(
-                fluid.layers.elementwise_mul(input_emb, true_emb_w),
-                dim=1,
-                keep_dim=True),
-            true_emb_b)
+            neg_emb_w = embedding_layer(inputs[2], "emb_w", emb_w_initializer)
+        true_logits = fluid.layers.reduce_sum(
+            fluid.layers.elementwise_mul(input_emb, true_emb_w),
+            dim=1,
+            keep_dim=True)
 
         input_emb_re = fluid.layers.reshape(
-            input_emb, shape=[-1, 1, sparse_feature_dim])
+            input_emb, shape=[-1, 1, self.sparse_feature_dim])
         neg_matmul = fluid.layers.matmul(
             input_emb_re, neg_emb_w, transpose_y=True)
-        neg_logits = fluid.layers.elementwise_add(
-            fluid.layers.reshape(
-                neg_matmul, shape=[-1, neg_num]),
-            neg_emb_b_vec)
-
-        label_ones = fluid.layers.fill_constant_batch_size_like(
-            true_logits, shape=[-1, 1], value=1.0, dtype='float32')
-        label_zeros = fluid.layers.fill_constant_batch_size_like(
-            true_logits, shape=[-1, neg_num], value=0.0, dtype='float32')
-
-        true_xent = fluid.layers.sigmoid_cross_entropy_with_logits(true_logits,
-                                                                   label_ones)
-        neg_xent = fluid.layers.sigmoid_cross_entropy_with_logits(neg_logits,
-                                                                  label_zeros)
-        cost = fluid.layers.elementwise_add(
-            fluid.layers.reduce_sum(
-                true_xent, dim=1),
-            fluid.layers.reduce_sum(
-                neg_xent, dim=1))
-        self.avg_cost = fluid.layers.reduce_mean(cost)
-        global_right_cnt = fluid.layers.create_global_var(
-            name="global_right_cnt",
-            persistable=True,
-            dtype='float32',
-            shape=[1],
-            value=0)
-        global_total_cnt = fluid.layers.create_global_var(
-            name="global_total_cnt",
-            persistable=True,
-            dtype='float32',
-            shape=[1],
-            value=0)
-        global_right_cnt.stop_gradient = True
-        global_total_cnt.stop_gradient = True
+        neg_logits = fluid.layers.reshape(neg_matmul, shape=[-1, 1])
 
-    def avg_loss(self):
-        self._cost = self.avg_cost
+        logits = fluid.layers.concat([true_logits, neg_logits], axis=0)
+        label_ones = fluid.layers.fill_constant(
+            shape=[fluid.layers.shape(true_logits)[0], 1],
+            value=1.0,
+            dtype='float32')
+        label_zeros = fluid.layers.fill_constant(
+            shape=[fluid.layers.shape(neg_logits)[0], 1],
+            value=0.0,
+            dtype='float32')
+        label = fluid.layers.concat([label_ones, label_zeros], axis=0)
 
-    def metrics(self):
-        self._metrics["LOSS"] = self.avg_cost
+        loss = fluid.layers.log_loss(fluid.layers.sigmoid(logits), label)
+        avg_cost = fluid.layers.reduce_sum(loss)
 
-    def train_net(self):
-        self.input()
-        self.net()
-        self.avg_loss()
-        self.metrics()
+        self._cost = avg_cost
+        self._metrics["LOSS"] = avg_cost
 
     def optimizer(self):
-        learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
-                                            None, self._namespace)
-        decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None,
-                                          self._namespace)
-        decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None,
-                                         self._namespace)
         optimizer = fluid.optimizer.SGD(
             learning_rate=fluid.layers.exponential_decay(
-                learning_rate=learning_rate,
-                decay_steps=decay_steps,
-                decay_rate=decay_rate,
+                learning_rate=self.learning_rate,
+                decay_steps=self.decay_steps,
+                decay_rate=self.decay_rate,
                 staircase=True))
         return optimizer
 
@@ -213,28 +162,22 @@ class Model(ModelBase):
                 use_double_buffer=False,
                 iterable=False)
 
-    def infer_net(self):
-        sparse_feature_dim = envs.get_global_env(
-            "hyper_parameters.sparse_feature_dim", None, self._namespace)
-        sparse_feature_number = envs.get_global_env(
-            "hyper_parameters.sparse_feature_number", None, self._namespace)
-
+    def infer_net(self, inputs):
         def embedding_layer(input, table_name, initializer_instance=None):
             emb = fluid.embedding(
                 input=input,
-                size=[sparse_feature_number, sparse_feature_dim],
+                size=[self.sparse_feature_number, self.sparse_feature_dim],
                 param_attr=table_name)
             return emb
 
-        self.analogy_input()
-        all_label = np.arange(sparse_feature_number).reshape(
-            sparse_feature_number).astype('int32')
+        all_label = np.arange(self.sparse_feature_number).reshape(
+            self.sparse_feature_number).astype('int32')
         self.all_label = fluid.layers.cast(
             x=fluid.layers.assign(all_label), dtype='int64')
         emb_all_label = embedding_layer(self.all_label, "emb")
-        emb_a = embedding_layer(self.analogy_a, "emb")
-        emb_b = embedding_layer(self.analogy_b, "emb")
-        emb_c = embedding_layer(self.analogy_c, "emb")
+        emb_a = embedding_layer(inputs[0], "emb")
+        emb_b = embedding_layer(inputs[1], "emb")
+        emb_c = embedding_layer(inputs[2], "emb")
 
         target = fluid.layers.elementwise_add(
             fluid.layers.elementwise_sub(emb_b, emb_a), emb_c)
@@ -245,34 +188,34 @@ class Model(ModelBase):
         values, pred_idx = fluid.layers.topk(input=dist, k=4)
         label = fluid.layers.expand(
             fluid.layers.unsqueeze(
-                self.analogy_d, axes=[1]),
-            expand_times=[1, 4])
+                inputs[3], axes=[1]), expand_times=[1, 4])
         label_ones = fluid.layers.fill_constant_batch_size_like(
             label, shape=[-1, 1], value=1.0, dtype='float32')
 
         right_cnt = fluid.layers.reduce_sum(input=fluid.layers.cast(
             fluid.layers.equal(pred_idx, label), dtype='float32'))
         total_cnt = fluid.layers.reduce_sum(label_ones)
 
-        global_right_cnt = fluid.layers.create_global_var(
-            name="global_right_cnt",
-            persistable=True,
-            dtype='float32',
-            shape=[1],
-            value=0)
-        global_total_cnt = fluid.layers.create_global_var(
-            name="global_total_cnt",
-            persistable=True,
-            dtype='float32',
-            shape=[1],
-            value=0)
+        # global_right_cnt = fluid.layers.create_global_var(
+        #     name="global_right_cnt",
+        #     persistable=True,
+        #     dtype='float32',
+        #     shape=[1],
+        #     value=0)
+        # global_total_cnt = fluid.layers.create_global_var(
+        #     name="global_total_cnt",
+        #     persistable=True,
+        #     dtype='float32',
+        #     shape=[1],
+        #     value=0)
-        global_right_cnt.stop_gradient = True
-        global_total_cnt.stop_gradient = True
-
-        tmp1 = fluid.layers.elementwise_add(right_cnt, global_right_cnt)
-        fluid.layers.assign(tmp1, global_right_cnt)
-        tmp2 = fluid.layers.elementwise_add(total_cnt, global_total_cnt)
-        fluid.layers.assign(tmp2, global_total_cnt)
-
-        acc = fluid.layers.elementwise_div(
-            global_right_cnt, global_total_cnt, name="total_acc")
+        # global_right_cnt.stop_gradient = True
+        # global_total_cnt.stop_gradient = True
+
+        # tmp1 = fluid.layers.elementwise_add(right_cnt, global_right_cnt)
+        # fluid.layers.assign(tmp1, global_right_cnt)
+        # tmp2 = fluid.layers.elementwise_add(total_cnt, global_total_cnt)
+        # fluid.layers.assign(tmp2, global_total_cnt)
+
+        # acc = fluid.layers.elementwise_div(
+        #     global_right_cnt, global_total_cnt, name="total_acc")
+        acc = fluid.layers.elementwise_div(right_cnt, total_cnt, name="acc")
         self._infer_results['acc'] = acc
diff --git a/models/recall/word2vec/w2v_evaluate_reader.py b/models/recall/word2vec/w2v_evaluate_reader.py
index 6350c960..8805b9aa 100755
--- a/models/recall/word2vec/w2v_evaluate_reader.py
+++ b/models/recall/word2vec/w2v_evaluate_reader.py
@@ -20,10 +20,10 @@ from paddlerec.core.reader import Reader
 from paddlerec.core.utils import envs
 
 
-class EvaluateReader(Reader):
+class TrainReader(Reader):
     def init(self):
-        dict_path = envs.get_global_env("word_id_dict_path", None,
-                                        "evaluate.reader")
+        dict_path = envs.get_global_env(
+            "dataset.dataset_infer.word_id_dict_path")
         self.word_to_id = dict()
         self.id_to_word = dict()
         with io.open(dict_path, 'r', encoding='utf-8') as f:
diff --git a/models/recall/word2vec/w2v_reader.py b/models/recall/word2vec/w2v_reader.py
index 9b3e6912..15bbd9b0 100755
--- a/models/recall/word2vec/w2v_reader.py
+++ b/models/recall/word2vec/w2v_reader.py
@@ -40,14 +40,12 @@ class NumpyRandomInt(object):
 
 class TrainReader(Reader):
     def init(self):
-        dict_path = envs.get_global_env("word_count_dict_path", None,
-                                        "train.reader")
-        self.window_size = envs.get_global_env("hyper_parameters.window_size",
-                                               None, "train.model")
-        self.neg_num = envs.get_global_env("hyper_parameters.neg_num", None,
-                                           "train.model")
+        dict_path = envs.get_global_env(
+            "dataset.dataset_train.word_count_dict_path")
+        self.window_size = envs.get_global_env("hyper_parameters.window_size")
+        self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
         self.with_shuffle_batch = envs.get_global_env(
-            "hyper_parameters.with_shuffle_batch", None, "train.model")
+            "hyper_parameters.with_shuffle_batch")
         self.random_generator = NumpyRandomInt(1, self.window_size + 1)
 
         self.cs = None
-- 
GitLab
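
A note for reviewers on the one functional change buried in the word2vec style fixes: the rewritten net() drops the per-word bias embeddings (emb_b) and scores the positive and the sampled negative words with a single sigmoid log loss over concatenated logits, matching the fasttext model. A minimal NumPy sketch of that objective, for reference only (array names are illustrative; this snippet is not part of the patch):

```python
import numpy as np

def neg_sampling_loss(input_emb, true_emb_w, neg_emb_w):
    # input_emb:  [batch, dim]      embedding of the center word
    # true_emb_w: [batch, dim]      output embedding of the true context word
    # neg_emb_w:  [batch, neg, dim] output embeddings of sampled negatives
    true_logits = np.sum(input_emb * true_emb_w, axis=1, keepdims=True)
    neg_logits = np.matmul(neg_emb_w, input_emb[:, :, None]).reshape(-1, 1)
    logits = np.concatenate([true_logits, neg_logits], axis=0)
    labels = np.concatenate(
        [np.ones_like(true_logits), np.zeros_like(neg_logits)], axis=0)
    prob = 1.0 / (1.0 + np.exp(-logits))  # sigmoid
    # binary log loss summed over all logits, essentially what
    # log_loss(sigmoid(logits)) followed by reduce_sum computes in model.py
    return -np.sum(labels * np.log(prob) + (1 - labels) * np.log(1 - prob))
```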