diff --git a/models/rank/dataset/Criteo_data/get_slot_data.py b/models/rank/dataset/Criteo_data/get_slot_data.py index f090c666de3709736eda701c7447e58cd9bec2a0..b9a4adc3feff53499f4ff190d4a321bc58cf2033 100644 --- a/models/rank/dataset/Criteo_data/get_slot_data.py +++ b/models/rank/dataset/Criteo_data/get_slot_data.py @@ -78,6 +78,7 @@ class Reader(dg.MultiSlotDataGenerator): v = i[1] for j in v: s += " " + k + ":" + str(j) + print(s.strip()) yield None return data_iter diff --git a/models/rank/dcn/data/get_slot_data.py b/models/rank/dcn/data/get_slot_data.py index 8df43176fe2214bc66e87301ec829f7e33c460a2..bd3d473bcda86fa6379c10eff889b319ccb9eb56 100755 --- a/models/rank/dcn/data/get_slot_data.py +++ b/models/rank/dcn/data/get_slot_data.py @@ -92,7 +92,7 @@ class Reader(dg.MultiSlotDataGenerator): v = i[1] for j in v: s += " " + k + ":" + str(j) - print s.strip() + print(s.strip()) yield None return data_iter diff --git a/models/rank/deepfm/data/get_slot_data.py b/models/rank/deepfm/data/get_slot_data.py index 920fff7911c6a95d7ac5505849a41283c9042acb..5ee0fcebc1f14891c81ce7c228e699f6c533d2a1 100755 --- a/models/rank/deepfm/data/get_slot_data.py +++ b/models/rank/deepfm/data/get_slot_data.py @@ -79,7 +79,7 @@ class Reader(dg.MultiSlotDataGenerator): v = i[1] for j in v: s += " " + k + ":" + str(j) - print s.strip() + print(s.strip()) yield None return data_iter diff --git a/models/rank/dnn/README.md b/models/rank/dnn/README.md index c307efcc255bc3d087de0cbcb2aaae39e65ce2f2..9656adc655d6e2d8931861a492af3583906f98f5 100644 --- a/models/rank/dnn/README.md +++ b/models/rank/dnn/README.md @@ -185,7 +185,7 @@ inputs = [dense_input] + sparse_input_ids + [label] ### CTR-DNN模型组网 -CTR-DNN模型的组网比较直观,本质是一个二分类任务,代码参考`network_conf.py`。模型主要组成是一个`Embedding`层,三个`FC`层,以及相应的分类任务的loss计算和auc计算。 +CTR-DNN模型的组网比较直观,本质是一个二分类任务,代码参考`model.py`。模型主要组成是一个`Embedding`层,三个`FC`层,以及相应的分类任务的loss计算和auc计算。 #### Embedding层 首先介绍Embedding层的搭建方式:`Embedding`层的输入是`sparse_input`,shape由超参的`sparse_feature_dim`和`embedding_size`定义。需要特别解释的是`is_sparse`参数,当我们指定`is_sprase=True`后,计算图会将该参数视为稀疏参数,反向更新以及分布式通信时,都以稀疏的方式进行,会极大的提升运行效率,同时保证效果一致。 @@ -235,7 +235,7 @@ fc3 = fluid.layers.fc( ) ``` #### Loss及Auc计算 -- 预测的结果通过一个输出shape为2的FC层给出,该FC层的激活函数时softmax,会给出每条样本分属于正负样本的概率。 +- 预测的结果通过一个输出shape为2的FC层给出,该FC层的激活函数是softmax,会给出每条样本分属于正负样本的概率。 - 每条样本的损失由交叉熵给出,交叉熵的输入维度为[batch_size,2],数据类型为float,label的输入维度为[batch_size,1],数据类型为int。 - 该batch的损失`avg_cost`是各条样本的损失之和 - 我们同时还会计算预测的auc,auc的结果由`fluid.layers.auc()`给出,该层的返回值有三个,分别是全局auc: `auc_var`,当前batch的auc: `batch_auc_var`,以及auc_states: `auc_states`,auc_states包含了`batch_stat_pos, batch_stat_neg, stat_pos, stat_neg`信息。`batch_auc`我们取近20个batch的平均,由参数`slide_steps=20`指定,roc曲线的离散化的临界数值设置为4096,由`num_thresholds=2**12`指定。 diff --git a/models/rank/dnn/data/get_slot_data.py b/models/rank/dnn/data/get_slot_data.py index f52447d06c297335685a704f688d71aa871328bc..cacdc279bd9877bfe974e6f093ba912972bee876 100755 --- a/models/rank/dnn/data/get_slot_data.py +++ b/models/rank/dnn/data/get_slot_data.py @@ -61,7 +61,7 @@ class CriteoDataset(dg.MultiSlotDataGenerator): s += " dense_feature:" + str(i) for i in range(1, 1 + len(categorical_range_)): s += " " + str(i) + ":" + str(sparse_feature[i - 1][0]) - print s.strip() + print(s.strip()) yield None return reader diff --git a/models/rank/logistic_regression/data/get_slot_data.py b/models/rank/logistic_regression/data/get_slot_data.py index ea1a96ef778e219de89b99b536c7473419bd26f5..f0b95a79c70d7c9da8b327884c17206e6b76f731 100644 --- a/models/rank/logistic_regression/data/get_slot_data.py +++ b/models/rank/logistic_regression/data/get_slot_data.py @@ -79,6 +79,7 @@ class Reader(dg.MultiSlotDataGenerator): v = i[1] for j in v: s += " " + k + ":" + str(j) + print(s.strip()) yield None return data_iter diff --git a/models/rank/nfm/data/get_slot_data.py b/models/rank/nfm/data/get_slot_data.py index f090c666de3709736eda701c7447e58cd9bec2a0..b9a4adc3feff53499f4ff190d4a321bc58cf2033 100644 --- a/models/rank/nfm/data/get_slot_data.py +++ b/models/rank/nfm/data/get_slot_data.py @@ -78,6 +78,7 @@ class Reader(dg.MultiSlotDataGenerator): v = i[1] for j in v: s += " " + k + ":" + str(j) + print(s.strip()) yield None return data_iter diff --git a/models/rank/wide_deep/data/get_slot_data.py b/models/rank/wide_deep/data/get_slot_data.py index 91411e3b6140506ca5a45194bc81168bc7c5a67f..5f873a3085a763df30aa3657aaec432b02e39ec8 100755 --- a/models/rank/wide_deep/data/get_slot_data.py +++ b/models/rank/wide_deep/data/get_slot_data.py @@ -50,6 +50,7 @@ class Reader(dg.MultiSlotDataGenerator): v = i[1] for j in v: s += " " + k + ":" + str(j) + print(s.strip()) yield None return data_iter diff --git a/models/rank/xdeepfm/data/get_slot_data.py b/models/rank/xdeepfm/data/get_slot_data.py index feef80507e81114745d00948aaede5fc3c7f51d8..dab44c85bce2875b867cd1c008832413535f46d5 100755 --- a/models/rank/xdeepfm/data/get_slot_data.py +++ b/models/rank/xdeepfm/data/get_slot_data.py @@ -49,7 +49,7 @@ class Reader(dg.MultiSlotDataGenerator): v = i[1] for j in v: s += " " + k + ":" + str(j) - print s.strip() + print(s.strip()) yield None return data_iter diff --git a/models/recall/gnn/evaluate_reader.py b/models/recall/gnn/evaluate_reader.py index 5b548fc3532ccc40dc575be5331e6dc98a77e64b..864d0047d325fc230c5ae10fd1df939375468d94 100755 --- a/models/recall/gnn/evaluate_reader.py +++ b/models/recall/gnn/evaluate_reader.py @@ -95,7 +95,8 @@ class Reader(ReaderBase): (batch_size, max_uniq_len, max_uniq_len)) mask = np.array(mask).astype("float32").reshape((batch_size, -1, 1)) label = np.array(label).astype("int64").reshape((batch_size, 1)) - return zip(items, seq_index, last_index, adj_in, adj_out, mask, label) + return list( + zip(items, seq_index, last_index, adj_in, adj_out, mask, label)) def batch_reader(self, batch_size, batch_group_size, train=True): def _reader(): diff --git a/models/recall/gnn/reader.py b/models/recall/gnn/reader.py index 0f553d6f9719792497df3608da3527aa13d03763..5ea4cfeef1513f2a76378e4341863fa9e3880460 100755 --- a/models/recall/gnn/reader.py +++ b/models/recall/gnn/reader.py @@ -94,7 +94,8 @@ class Reader(ReaderBase): (batch_size, max_uniq_len, max_uniq_len)) mask = np.array(mask).astype("float32").reshape((batch_size, -1, 1)) label = np.array(label).astype("int64").reshape((batch_size, 1)) - return zip(items, seq_index, last_index, adj_in, adj_out, mask, label) + return list( + zip(items, seq_index, last_index, adj_in, adj_out, mask, label)) def batch_reader(self, batch_size, batch_group_size, train=True): def _reader(): diff --git a/models/recall/gru4rec/rsc15_reader.py b/models/recall/gru4rec/rsc15_reader.py index fa4ae44d2ce2102ec282f0b84c09c6bb78efb4dc..6794ec59a70d95bab22cd27298f21a34aa5649cf 100644 --- a/models/recall/gru4rec/rsc15_reader.py +++ b/models/recall/gru4rec/rsc15_reader.py @@ -37,6 +37,6 @@ class Reader(ReaderBase): trg_seq = l[1:] trg_seq = [int(e) for e in trg_seq] feature_name = ["src_wordseq", "dst_wordseq"] - yield zip(feature_name, [src_seq] + [trg_seq]) + yield list(zip(feature_name, [src_seq] + [trg_seq])) return reader diff --git a/models/recall/ncf/movielens_infer_reader.py b/models/recall/ncf/movielens_infer_reader.py index 222e6d3d256630c2f15e555d2f4bf8a88b460f74..1e0e62af83473021884375ba86ba0be0161f6742 100644 --- a/models/recall/ncf/movielens_infer_reader.py +++ b/models/recall/ncf/movielens_infer_reader.py @@ -35,7 +35,7 @@ class Reader(ReaderBase): features = line.strip().split(',') feature_name = ["user_input", "item_input"] - yield zip(feature_name, - [[int(features[0])]] + [[int(features[1])]]) + yield list( + zip(feature_name, [[int(features[0])]] + [[int(features[1])]])) return reader diff --git a/models/recall/ncf/movielens_reader.py b/models/recall/ncf/movielens_reader.py index 4a0995cc80b902a995a4560f28f50ac9bf2157ef..093f079fdcccda33ba023221f46e037e113de3db 100644 --- a/models/recall/ncf/movielens_reader.py +++ b/models/recall/ncf/movielens_reader.py @@ -35,7 +35,8 @@ class Reader(ReaderBase): features = line.strip().split(',') feature_name = ["user_input", "item_input", "label"] - yield zip(feature_name, [[int(features[0])]] + - [[int(features[1])]] + [[int(features[2])]]) + yield list( + zip(feature_name, [[int(features[0])]] + [[int(features[1])]] + + [[int(features[2])]])) return reader diff --git a/models/recall/ssr/ssr_infer_reader.py b/models/recall/ssr/ssr_infer_reader.py index 61a67b5898085b763e632d17459615c03ebbdbb3..59c90fc5d1f7b3763528045e7cbe320e013842d6 100644 --- a/models/recall/ssr/ssr_infer_reader.py +++ b/models/recall/ssr/ssr_infer_reader.py @@ -40,9 +40,9 @@ class Reader(ReaderBase): src = conv_ids[:boundary] pos_tgt = [conv_ids[boundary]] feature_name = ["user", "all_item", "p_item"] - yield zip( - feature_name, - [src] + [np.arange(self.vocab_size).astype("int64").tolist()] + - [pos_tgt]) + yield list( + zip(feature_name, [src] + [ + np.arange(self.vocab_size).astype("int64").tolist() + ] + [pos_tgt])) return reader diff --git a/models/recall/ssr/ssr_reader.py b/models/recall/ssr/ssr_reader.py index 2b436fd4bfa7455367211194f154d27b4a9a8ebc..5463e3b0406873b18b7c89b03d17f1373d32d0de 100644 --- a/models/recall/ssr/ssr_reader.py +++ b/models/recall/ssr/ssr_reader.py @@ -42,6 +42,6 @@ class Reader(ReaderBase): pos_tgt = [conv_ids[boundary]] neg_tgt = [self.sample_neg_from_seq(src)] feature_name = ["user", "p_item", "n_item"] - yield zip(feature_name, [src] + [pos_tgt] + [neg_tgt]) + yield list(zip(feature_name, [src] + [pos_tgt] + [neg_tgt])) return reader diff --git a/models/recall/youtube_dnn/random_reader.py b/models/recall/youtube_dnn/random_reader.py index 85bb89d4df203cb65f18b35e0c3cc26ada7fc17f..a81269ca2efdb3cf0b928ec27e9d19ff80d77aeb 100644 --- a/models/recall/youtube_dnn/random_reader.py +++ b/models/recall/youtube_dnn/random_reader.py @@ -41,10 +41,11 @@ class Reader(ReaderBase): """ feature_name = ["watch_vec", "search_vec", "other_feat", "label"] - yield zip(feature_name, - [np.random.rand(self.watch_vec_size).tolist()] + - [np.random.rand(self.search_vec_size).tolist()] + - [np.random.rand(self.other_feat_size).tolist()] + - [[np.random.randint(self.output_size)]]) + yield list( + zip(feature_name, [ + np.random.rand(self.watch_vec_size).tolist() + ] + [np.random.rand(self.search_vec_size).tolist()] + [ + np.random.rand(self.other_feat_size).tolist() + ] + [[np.random.randint(self.output_size)]])) return reader