def path_adapter(path):
    """Translate a ``paddlerec.``-prefixed dotted path into a filesystem path.

    Args:
        path: Either a dotted path that starts with ``"paddlerec."`` (for
            example ``"paddlerec.models.recall.ncf"``) or any other string.

    Returns:
        For a ``paddlerec.``-prefixed input: whatever
        ``get_runtime_environ("PACKAGE_BASE")`` returns, joined with the
        remainder of the path after every ``.`` in that remainder has been
        replaced by ``/``. Any other input is returned unchanged.
    """
    prefix = "paddlerec."
    if not path.startswith(prefix):
        return path

    package = get_runtime_environ("PACKAGE_BASE")
    # Strip only the leading prefix. Splitting on every occurrence of
    # "paddlerec." and keeping element 1 would truncate inputs in which the
    # token shows up again, e.g. "paddlerec.models.my_paddlerec.demo".
    relative = path[len(prefix):].replace(".", "/")
    return os.path.join(package, relative)
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/models/recall/ncf/config.yaml b/models/recall/ncf/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..249f6fccefa3b8ec11376a390433dd52c84682e7 --- /dev/null +++ b/models/recall/ncf/config.yaml @@ -0,0 +1,53 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +evaluate: + reader: + batch_size: 1 + class: "{workspace}/movielens_infer_reader.py" + test_data_path: "{workspace}/data/test" + +train: + trainer: + # for cluster training + strategy: "async" + + epochs: 3 + workspace: "paddlerec.models.recall.ncf" + device: cpu + + reader: + batch_size: 2 + class: "{workspace}/movielens_reader.py" + train_data_path: "{workspace}/data/train" + + model: + models: "{workspace}/model.py" + hyper_parameters: + num_users: 6040 + num_items: 3706 + latent_dim: 8 + layers: [64, 32, 16, 8] + learning_rate: 0.001 + optimizer: adam + + save: + increment: + dirname: "increment" + epoch_interval: 2 + save_last: True + inference: + dirname: "inference" + epoch_interval: 4 + save_last: True diff --git a/models/recall/ncf/data/test/small_data.txt b/models/recall/ncf/data/test/small_data.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3c4cf5f84f66594e76603cce1f18d211ebd05a7 --- /dev/null +++ b/models/recall/ncf/data/test/small_data.txt @@ -0,0 +1,100 @@ 
+4764,174,1 +4764,2958,0 +4764,452,0 +4764,1946,0 +4764,3208,0 +2044,2237,1 +2044,1998,0 +2044,328,0 +2044,1542,0 +2044,1932,0 +4276,65,1 +4276,3247,0 +4276,942,0 +4276,3666,0 +4276,2222,0 +3933,682,1 +3933,2451,0 +3933,3695,0 +3933,1643,0 +3933,3568,0 +1151,1265,1 +1151,118,0 +1151,2532,0 +1151,2083,0 +1151,2350,0 +1757,876,1 +1757,201,0 +1757,3633,0 +1757,1068,0 +1757,2549,0 +3370,276,1 +3370,2435,0 +3370,606,0 +3370,910,0 +3370,2146,0 +5137,1018,1 +5137,2163,0 +5137,3167,0 +5137,2315,0 +5137,3595,0 +3933,2831,1 +3933,2881,0 +3933,2949,0 +3933,3660,0 +3933,417,0 +3102,999,1 +3102,1902,0 +3102,2161,0 +3102,3042,0 +3102,1113,0 +2022,336,1 +2022,1672,0 +2022,2656,0 +2022,3649,0 +2022,883,0 +2664,655,1 +2664,3660,0 +2664,1711,0 +2664,3386,0 +2664,1668,0 +25,701,1 +25,32,0 +25,2482,0 +25,3177,0 +25,2767,0 +1738,1643,1 +1738,2187,0 +1738,228,0 +1738,650,0 +1738,3101,0 +5411,1241,1 +5411,2546,0 +5411,3019,0 +5411,3618,0 +5411,1674,0 +638,579,1 +638,3512,0 +638,783,0 +638,2111,0 +638,1880,0 +3554,200,1 +3554,2893,0 +3554,2428,0 +3554,969,0 +3554,2741,0 +4283,1074,1 +4283,3056,0 +4283,2032,0 +4283,405,0 +4283,1505,0 +5111,200,1 +5111,3488,0 +5111,477,0 +5111,2790,0 +5111,40,0 +3964,515,1 +3964,1528,0 +3964,2173,0 +3964,1701,0 +3964,2832,0 diff --git a/models/recall/ncf/data/train/small_data.txt b/models/recall/ncf/data/train/small_data.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3c4cf5f84f66594e76603cce1f18d211ebd05a7 --- /dev/null +++ b/models/recall/ncf/data/train/small_data.txt @@ -0,0 +1,100 @@ +4764,174,1 +4764,2958,0 +4764,452,0 +4764,1946,0 +4764,3208,0 +2044,2237,1 +2044,1998,0 +2044,328,0 +2044,1542,0 +2044,1932,0 +4276,65,1 +4276,3247,0 +4276,942,0 +4276,3666,0 +4276,2222,0 +3933,682,1 +3933,2451,0 +3933,3695,0 +3933,1643,0 +3933,3568,0 +1151,1265,1 +1151,118,0 +1151,2532,0 +1151,2083,0 +1151,2350,0 +1757,876,1 +1757,201,0 +1757,3633,0 +1757,1068,0 +1757,2549,0 +3370,276,1 +3370,2435,0 +3370,606,0 +3370,910,0 +3370,2146,0 
+5137,1018,1 +5137,2163,0 +5137,3167,0 +5137,2315,0 +5137,3595,0 +3933,2831,1 +3933,2881,0 +3933,2949,0 +3933,3660,0 +3933,417,0 +3102,999,1 +3102,1902,0 +3102,2161,0 +3102,3042,0 +3102,1113,0 +2022,336,1 +2022,1672,0 +2022,2656,0 +2022,3649,0 +2022,883,0 +2664,655,1 +2664,3660,0 +2664,1711,0 +2664,3386,0 +2664,1668,0 +25,701,1 +25,32,0 +25,2482,0 +25,3177,0 +25,2767,0 +1738,1643,1 +1738,2187,0 +1738,228,0 +1738,650,0 +1738,3101,0 +5411,1241,1 +5411,2546,0 +5411,3019,0 +5411,3618,0 +5411,1674,0 +638,579,1 +638,3512,0 +638,783,0 +638,2111,0 +638,1880,0 +3554,200,1 +3554,2893,0 +3554,2428,0 +3554,969,0 +3554,2741,0 +4283,1074,1 +4283,3056,0 +4283,2032,0 +4283,405,0 +4283,1505,0 +5111,200,1 +5111,3488,0 +5111,477,0 +5111,2790,0 +5111,40,0 +3964,515,1 +3964,1528,0 +3964,2173,0 +3964,1701,0 +3964,2832,0 diff --git a/models/recall/ncf/model.py b/models/recall/ncf/model.py new file mode 100644 index 0000000000000000000000000000000000000000..be7c465dc75d7186f6d63a6d1fbf604f84945891 --- /dev/null +++ b/models/recall/ncf/model.py @@ -0,0 +1,115 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class Model(ModelBase):
    """Network definition for the NCF recall model.

    The graph has two branches fed by the same user/item ids: an element-wise
    product of two embeddings (the "MF part") and a stack of fully connected
    layers over the concatenation of two further embeddings (the "MLP part").
    Both branches are concatenated and passed through a single sigmoid unit.
    """

    def __init__(self, config):
        ModelBase.__init__(self, config)

    def input_data(self, is_infer=False):
        """Declare the input variables and remember them in ``self._data_var``.

        Args:
            is_infer: When true the label variable is left out of the result.

        Returns:
            ``[user_input, item_input]`` for inference, otherwise
            ``[user_input, item_input, label]``.
        """
        user_ids = fluid.data(name="user_input", shape=[-1, 1], dtype="int64", lod_level=0)
        item_ids = fluid.data(name="item_input", shape=[-1, 1], dtype="int64", lod_level=0)
        # The label variable is declared in both modes, exactly as before.
        click = fluid.data(name="label", shape=[-1, 1], dtype="int64", lod_level=0)

        feeds = [user_ids, item_ids]
        if not is_infer:
            feeds.append(click)
        self._data_var = feeds

        return feeds

    def net(self, inputs, is_infer=False):
        """Build the network on top of ``inputs``.

        Args:
            inputs: Variables as returned by ``input_data``; index 0 is the
                user ids, index 1 the item ids and (training only) index 2
                the label.
            is_infer: When true only the prediction is registered in
                ``self._infer_results`` and no loss is built.
        """
        num_users = envs.get_global_env("hyper_parameters.num_users", None, self._namespace)
        num_items = envs.get_global_env("hyper_parameters.num_items", None, self._namespace)
        latent_dim = envs.get_global_env("hyper_parameters.latent_dim", None, self._namespace)
        layers = envs.get_global_env("hyper_parameters.layers", None, self._namespace)

        depth = len(layers)  # number of entries in the MLP layer list
        half_width = int(layers[0] / 2)

        def lookup(ids, vocab_size, width):
            # Sparse embedding table initialised from N(0, 0.01).
            return fluid.embedding(
                input=ids,
                size=[vocab_size, width],
                param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01),
                is_sparse=True)

        # The four tables are created in the same order as before
        # (MF user, MF item, MLP user, MLP item).
        mf_user_emb = lookup(inputs[0], num_users, latent_dim)
        mf_item_emb = lookup(inputs[1], num_items, latent_dim)
        mlp_user_emb = lookup(inputs[0], num_users, half_width)
        mlp_item_emb = lookup(inputs[1], num_items, half_width)

        # MF part: element-wise product of the flattened embeddings.
        mf_user_flat = fluid.layers.flatten(x=mf_user_emb, axis=1)
        mf_item_flat = fluid.layers.flatten(x=mf_item_emb, axis=1)
        mf_branch = fluid.layers.elementwise_mul(mf_user_flat, mf_item_flat)

        # MLP part: layer 0 is the concatenation of the two embeddings.
        mlp_user_flat = fluid.layers.flatten(x=mlp_user_emb, axis=1)
        mlp_item_flat = fluid.layers.flatten(x=mlp_item_emb, axis=1)
        mlp_branch = fluid.layers.concat(input=[mlp_user_flat, mlp_item_flat], axis=-1)

        for idx in range(1, depth):
            # Initialiser scale follows the width of the layer's input.
            fan_in = mlp_branch.shape[1]
            weight_attr = fluid.ParamAttr(
                initializer=fluid.initializer.TruncatedNormal(
                    loc=0.0, scale=1.0 / math.sqrt(fan_in)),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4))
            mlp_branch = fluid.layers.fc(
                input=mlp_branch,
                size=layers[idx],
                act='relu',
                param_attr=weight_attr,
                name='layer_' + str(idx))

        # Join both branches and map them to a single sigmoid output.
        joined = fluid.layers.concat(input=[mf_branch, mlp_branch], axis=-1)
        prediction = fluid.layers.fc(
            input=joined,
            size=1,
            act='sigmoid',
            param_attr=fluid.initializer.MSRAInitializer(uniform=True),
            name='prediction')

        if is_infer:
            self._infer_results["prediction"] = prediction
            return

        float_label = fluid.layers.cast(x=inputs[2], dtype='float32')
        cost = fluid.layers.log_loss(input=prediction, label=float_label)
        avg_cost = fluid.layers.mean(cost)

        self._cost = avg_cost
        self._metrics["cost"] = avg_cost

    def train_net(self):
        """Declare the training inputs and build the training graph."""
        feeds = self.input_data()
        self.net(feeds)

    def infer_net(self):
        """Declare the inference inputs, their data loader and the graph."""
        self._infer_data_var = self.input_data(is_infer=True)
        self._infer_data_loader = fluid.io.DataLoader.from_generator(
            feed_list=self._infer_data_var,
            capacity=64,
            use_double_buffer=False,
            iterable=False)
        self.net(self._infer_data_var, is_infer=True)
class EvaluateReader(Reader):
    """Reader that turns comma-separated text lines into inference samples."""

    def init(self):
        """Nothing to set up; this reader keeps no state."""
        pass

    def generate_sample(self, line):
        """Build the sample generator for one input line.

        Args:
            line: One line of text whose first two comma-separated fields are
                integers; they feed the ``user_input`` and ``item_input``
                slots. Any further fields are ignored.

        Returns:
            A zero-argument generator function that yields one sample: a list
            of ``(slot_name, [value])`` pairs.

        Raises:
            ValueError: (when the generator runs) if one of the first two
                fields is not an integer.
            IndexError: (when the generator runs) if the line has fewer than
                two fields.
        """

        def reader():
            fields = line.strip().split(',')

            feature_name = ["user_input", "item_input"]
            values = [[int(fields[0])], [int(fields[1])]]
            # zip() returns a list on Python 2 but a lazy, one-shot iterator
            # on Python 3. Materialise it so the sample is a real list on
            # both, and can be iterated more than once or type-checked as a
            # list by whatever consumes it.
            yield list(zip(feature_name, values))

        return reader
class TrainReader(Reader):
    """Reader that turns comma-separated text lines into training samples."""

    def init(self):
        """Nothing to set up; this reader keeps no state."""
        pass

    def generate_sample(self, line):
        """Build the sample generator for one input line.

        Args:
            line: One line of text whose first three comma-separated fields
                are integers; they feed the ``user_input``, ``item_input``
                and ``label`` slots, in that order.

        Returns:
            A zero-argument generator function that yields one sample: a list
            of ``(slot_name, [value])`` pairs.

        Raises:
            ValueError: (when the generator runs) if one of the first three
                fields is not an integer.
            IndexError: (when the generator runs) if the line has fewer than
                three fields.
        """

        def reader():
            fields = line.strip().split(',')

            feature_name = ["user_input", "item_input", "label"]
            values = [[int(fields[0])], [int(fields[1])], [int(fields[2])]]
            # zip() returns a list on Python 2 but a lazy, one-shot iterator
            # on Python 3. Materialise it so the sample is a real list on
            # both, and can be iterated more than once or type-checked as a
            # list by whatever consumes it.
            yield list(zip(feature_name, values))

        return reader
and their Compositionality](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)(2013) | | GRU4REC | SR-GRU | [Session-based Recommendations with Recurrent Neural Networks](https://arxiv.org/abs/1511.06939)(2015) | +| Youtube_DNN | Youtube_DNN | [Deep Neural Networks for YouTube Recommendations](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf)(2016) | | SSR | Sequence Semantic Retrieval Model | [Multi-Rate Deep Learning for Temporal Recommendation](http://sonyis.me/paperpdf/spr209-song_sigir16.pdf)(2016) | +| NCF | Neural Collaborative Filtering | [Neural Collaborative Filtering](https://arxiv.org/pdf/1708.05031.pdf)(2017) | | GNN | SR-GNN | [Session-based Recommendation with Graph Neural Networks](https://arxiv.org/abs/1811.00855)(2018) | 下面是每个模型的简介(注:图片引用自链接中的论文) @@ -35,31 +37,45 @@

+[Youtube_DNN](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf): +

+ +

+ [SSR](http://sonyis.me/paperpdf/spr209-song_sigir16.pdf):

+[NCF](https://arxiv.org/pdf/1708.05031.pdf): +

+ +

+ [GNN](https://arxiv.org/abs/1811.00855):

## 使用教程 -### 训练&预测 +### 训练 预测 ```shell python -m paddlerec.run -m paddlerec.models.recall.word2vec # word2vec python -m paddlerec.run -m paddlerec.models.recall.ssr # ssr python -m paddlerec.run -m paddlerec.models.recall.gru4rec # gru4rec python -m paddlerec.run -m paddlerec.models.recall.gnn # gnn +python -m paddlerec.run -m paddlerec.models.recall.ncf # ncf +python -m paddlerec.run -m paddlerec.models.recall.youtube_dnn # youtube_dnn ``` ## 效果对比 ### 模型效果列表 -| 数据集 | 模型 | loss | Recall@20 | +| 数据集 | 模型 | HR@10 | Recall@20 | | :------------------: | :--------------------: | :---------: |:---------: | | DIGINETICA | GNN | -- | 0.507 | | RSC15 | GRU4REC | -- | 0.670 | | RSC15 | SSR | -- | 0.590 | +| MOVIELENS | NCF | 0.688 | -- | +| -- | Youtube | -- | -- | | 1 Billion Word Language Model Benchmark | Word2Vec | -- | 0.54 | diff --git a/models/recall/youtube_dnn/__init__.py b/models/recall/youtube_dnn/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/models/recall/youtube_dnn/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/models/recall/youtube_dnn/config.yaml b/models/recall/youtube_dnn/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6cffbaba0abe7b42dfb653b1876f71936827a7bc --- /dev/null +++ b/models/recall/youtube_dnn/config.yaml @@ -0,0 +1,49 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +train: + trainer: + # for cluster training + strategy: "async" + + epochs: 3 + workspace: "paddlerec.models.recall.youtube_dnn" + device: cpu + + reader: + batch_size: 2 + class: "{workspace}/random_reader.py" + train_data_path: "{workspace}/data/train" + + model: + models: "{workspace}/model.py" + hyper_parameters: + watch_vec_size: 64 + search_vec_size: 64 + other_feat_size: 64 + output_size: 100 + layers: [128, 64, 32] + learning_rate: 0.01 + optimizer: sgd + + save: + increment: + dirname: "increment" + epoch_interval: 2 + save_last: True + inference: + dirname: "inference" + epoch_interval: 4 + save_last: True diff --git a/models/recall/youtube_dnn/data/test/small_data.txt b/models/recall/youtube_dnn/data/test/small_data.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3c4cf5f84f66594e76603cce1f18d211ebd05a7 --- /dev/null +++ b/models/recall/youtube_dnn/data/test/small_data.txt @@ -0,0 +1,100 @@ +4764,174,1 +4764,2958,0 +4764,452,0 +4764,1946,0 +4764,3208,0 +2044,2237,1 +2044,1998,0 +2044,328,0 +2044,1542,0 +2044,1932,0 +4276,65,1 +4276,3247,0 
+4276,942,0 +4276,3666,0 +4276,2222,0 +3933,682,1 +3933,2451,0 +3933,3695,0 +3933,1643,0 +3933,3568,0 +1151,1265,1 +1151,118,0 +1151,2532,0 +1151,2083,0 +1151,2350,0 +1757,876,1 +1757,201,0 +1757,3633,0 +1757,1068,0 +1757,2549,0 +3370,276,1 +3370,2435,0 +3370,606,0 +3370,910,0 +3370,2146,0 +5137,1018,1 +5137,2163,0 +5137,3167,0 +5137,2315,0 +5137,3595,0 +3933,2831,1 +3933,2881,0 +3933,2949,0 +3933,3660,0 +3933,417,0 +3102,999,1 +3102,1902,0 +3102,2161,0 +3102,3042,0 +3102,1113,0 +2022,336,1 +2022,1672,0 +2022,2656,0 +2022,3649,0 +2022,883,0 +2664,655,1 +2664,3660,0 +2664,1711,0 +2664,3386,0 +2664,1668,0 +25,701,1 +25,32,0 +25,2482,0 +25,3177,0 +25,2767,0 +1738,1643,1 +1738,2187,0 +1738,228,0 +1738,650,0 +1738,3101,0 +5411,1241,1 +5411,2546,0 +5411,3019,0 +5411,3618,0 +5411,1674,0 +638,579,1 +638,3512,0 +638,783,0 +638,2111,0 +638,1880,0 +3554,200,1 +3554,2893,0 +3554,2428,0 +3554,969,0 +3554,2741,0 +4283,1074,1 +4283,3056,0 +4283,2032,0 +4283,405,0 +4283,1505,0 +5111,200,1 +5111,3488,0 +5111,477,0 +5111,2790,0 +5111,40,0 +3964,515,1 +3964,1528,0 +3964,2173,0 +3964,1701,0 +3964,2832,0 diff --git a/models/recall/youtube_dnn/data/train/samll_data.txt b/models/recall/youtube_dnn/data/train/samll_data.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3c4cf5f84f66594e76603cce1f18d211ebd05a7 --- /dev/null +++ b/models/recall/youtube_dnn/data/train/samll_data.txt @@ -0,0 +1,100 @@ +4764,174,1 +4764,2958,0 +4764,452,0 +4764,1946,0 +4764,3208,0 +2044,2237,1 +2044,1998,0 +2044,328,0 +2044,1542,0 +2044,1932,0 +4276,65,1 +4276,3247,0 +4276,942,0 +4276,3666,0 +4276,2222,0 +3933,682,1 +3933,2451,0 +3933,3695,0 +3933,1643,0 +3933,3568,0 +1151,1265,1 +1151,118,0 +1151,2532,0 +1151,2083,0 +1151,2350,0 +1757,876,1 +1757,201,0 +1757,3633,0 +1757,1068,0 +1757,2549,0 +3370,276,1 +3370,2435,0 +3370,606,0 +3370,910,0 +3370,2146,0 +5137,1018,1 +5137,2163,0 +5137,3167,0 +5137,2315,0 +5137,3595,0 +3933,2831,1 +3933,2881,0 +3933,2949,0 +3933,3660,0 +3933,417,0 
+3102,999,1 +3102,1902,0 +3102,2161,0 +3102,3042,0 +3102,1113,0 +2022,336,1 +2022,1672,0 +2022,2656,0 +2022,3649,0 +2022,883,0 +2664,655,1 +2664,3660,0 +2664,1711,0 +2664,3386,0 +2664,1668,0 +25,701,1 +25,32,0 +25,2482,0 +25,3177,0 +25,2767,0 +1738,1643,1 +1738,2187,0 +1738,228,0 +1738,650,0 +1738,3101,0 +5411,1241,1 +5411,2546,0 +5411,3019,0 +5411,3618,0 +5411,1674,0 +638,579,1 +638,3512,0 +638,783,0 +638,2111,0 +638,1880,0 +3554,200,1 +3554,2893,0 +3554,2428,0 +3554,969,0 +3554,2741,0 +4283,1074,1 +4283,3056,0 +4283,2032,0 +4283,405,0 +4283,1505,0 +5111,200,1 +5111,3488,0 +5111,477,0 +5111,2790,0 +5111,40,0 +3964,515,1 +3964,1528,0 +3964,2173,0 +3964,1701,0 +3964,2832,0 diff --git a/models/recall/youtube_dnn/model.py b/models/recall/youtube_dnn/model.py new file mode 100644 index 0000000000000000000000000000000000000000..63d1fd2f49aad3c59272e560ed64442ab5f2f41e --- /dev/null +++ b/models/recall/youtube_dnn/model.py @@ -0,0 +1,86 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class Model(ModelBase):
    """Network definition for the Youtube-DNN recall model.

    Three dense float inputs are concatenated and passed through three ReLU
    layers followed by a softmax layer; the training loss is the mean
    cross-entropy against the integer label.
    """

    def __init__(self, config):
        ModelBase.__init__(self, config)

    def input_data(self, is_infer=False):
        """Declare the input variables and remember them in ``self._data_var``.

        Args:
            is_infer: Accepted for interface compatibility; not consulted.

        Returns:
            ``[watch_vec, search_vec, other_feat, label]``.
        """
        watch_dim = envs.get_global_env("hyper_parameters.watch_vec_size", None, self._namespace)
        search_dim = envs.get_global_env("hyper_parameters.search_vec_size", None, self._namespace)
        other_dim = envs.get_global_env("hyper_parameters.other_feat_size", None, self._namespace)

        watch = fluid.data(name="watch_vec", shape=[None, watch_dim], dtype="float32")
        search = fluid.data(name="search_vec", shape=[None, search_dim], dtype="float32")
        other = fluid.data(name="other_feat", shape=[None, other_dim], dtype="float32")
        target = fluid.data(name="label", shape=[None, 1], dtype="int64")

        feeds = [watch, search, other, target]
        self._data_var = feeds

        return feeds

    def fc(self, tag, data, out_dim, active='relu'):
        """Add one fully connected layer named ``tag`` on top of ``data``.

        Only the layer tagged ``'l4'`` gets an explicit weight attribute (a
        normal initialiser scaled by ``1 / sqrt(data.shape[1])``); every other
        tag passes ``param_attr=None``. The bias is always initialised to the
        constant 0.1.

        Args:
            tag: Layer name; also the prefix of the parameter names.
            data: Input variable; its second dimension sets the initialiser scale.
            out_dim: Output width of the layer.
            active: Activation name passed to ``fluid.layers.fc``.

        Returns:
            The output variable of the new layer.
        """
        init_stddev = 1.0
        # Computed for every tag, as before, even though only 'l4' uses it.
        scales = 1.0 / np.sqrt(data.shape[1])

        weight_attr = None
        if tag == 'l4':
            weight_attr = fluid.param_attr.ParamAttr(
                name='%s_weight' % tag,
                initializer=fluid.initializer.NormalInitializer(
                    loc=0.0, scale=init_stddev * scales))

        bias_attr = fluid.ParamAttr(
            name='%s_bias' % tag,
            initializer=fluid.initializer.Constant(0.1))

        return fluid.layers.fc(
            input=data,
            size=out_dim,
            act=active,
            param_attr=weight_attr,
            bias_attr=bias_attr,
            name=tag)

    def net(self, inputs):
        """Build the training graph; sets ``self._cost`` and the ``acc`` metric.

        Args:
            inputs: Variables as returned by ``input_data``; the last entry is
                the label, all others are concatenated as features.
        """
        output_size = envs.get_global_env("hyper_parameters.output_size", None, self._namespace)
        layers = envs.get_global_env("hyper_parameters.layers", None, self._namespace)
        target = inputs[-1]

        hidden = fluid.layers.concat(input=inputs[:-1], axis=-1)
        # Exactly the first three configured widths are used, in order.
        for tag, pos in (('l1', 0), ('l2', 1), ('l3', 2)):
            hidden = self.fc(tag, hidden, layers[pos], 'relu')
        probs = self.fc('l4', hidden, output_size, 'softmax')

        num_seqs = fluid.layers.create_tensor(dtype='int64')
        acc = fluid.layers.accuracy(input=probs, label=target, total=num_seqs)

        cost = fluid.layers.cross_entropy(input=probs, label=target)
        avg_cost = fluid.layers.mean(cost)

        self._cost = avg_cost
        self._metrics["acc"] = acc

    def train_net(self):
        """Declare the inputs and build the training graph."""
        feeds = self.input_data()
        self.net(feeds)

    def infer_net(self):
        """No inference graph is defined for this model."""
        pass
class TrainReader(Reader):
    """Reader that ignores its input and emits randomly generated samples."""

    def init(self):
        """Read the feature widths and class count from the ``train.model`` config."""
        self.watch_vec_size = envs.get_global_env("hyper_parameters.watch_vec_size", None, "train.model")
        self.search_vec_size = envs.get_global_env("hyper_parameters.search_vec_size", None, "train.model")
        self.other_feat_size = envs.get_global_env("hyper_parameters.other_feat_size", None, "train.model")
        self.output_size = envs.get_global_env("hyper_parameters.output_size", None, "train.model")

    def generate_sample(self, line):
        """Build the sample generator for one input line.

        Args:
            line: Ignored; the content of the data file is not used.

        Returns:
            A zero-argument generator function that yields one sample: a list
            of ``(slot_name, values)`` pairs. ``watch_vec``, ``search_vec``
            and ``other_feat`` are lists of uniform random floats in
            ``[0, 1)`` with the configured widths. ``label`` is a
            single-element list holding a random integer in
            ``[0, output_size)``.
        """

        def reader():
            feature_name = ["watch_vec", "search_vec", "other_feat", "label"]
            # Draw order is unchanged: watch, search, other, then the label.
            values = [
                np.random.rand(self.watch_vec_size).tolist(),
                np.random.rand(self.search_vec_size).tolist(),
                np.random.rand(self.other_feat_size).tolist(),
                [np.random.randint(self.output_size)],
            ]
            # zip() returns a list on Python 2 but a lazy, one-shot iterator
            # on Python 3. Materialise it so the sample is a real list on
            # both, and can be iterated more than once or type-checked as a
            # list by whatever consumes it.
            yield list(zip(feature_name, values))

        return reader