diff --git a/README.md b/README.md index 9a6b8e21b112c5895d350d3cd03ec78e6258029d..51ed968def32662605d2bfb2292ef31b8ab7c3b4 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,7 @@ | Rank | [DeepFM](models/rank/deepfm/model.py) | ✓ | x | ✓ | x | [IJCAI 2017][DeepFM: A Factorization-Machine based Neural Network for CTR Prediction](https://arxiv.org/pdf/1703.04247.pdf) | | Rank | [xDeepFM](models/rank/xdeepfm/model.py) | ✓ | x | ✓ | x | [KDD 2018][xDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems](https://dl.acm.org/doi/pdf/10.1145/3219819.3220023) | | Rank | [DIN](models/rank/din/model.py) | ✓ | x | ✓ | x | [KDD 2018][Deep Interest Network for Click-Through Rate Prediction](https://dl.acm.org/doi/pdf/10.1145/3219819.3219823) | + | Rank | [DIEN](models/rank/dien/model.py) | ✓ | x | ✓ | x | [AAAI 2019][Deep Interest Evolution Network for Click-Through Rate Prediction](https://www.aaai.org/ojs/index.php/AAAI/article/view/4545/4423) | | Rank | [Wide&Deep](models/rank/wide_deep/model.py) | ✓ | x | ✓ | x | [DLRS 2016][Wide & Deep Learning for Recommender Systems](https://dl.acm.org/doi/pdf/10.1145/2988450.2988454) | | Rank | [FGCNN](models/rank/fgcnn/model.py) | ✓ | ✓ | ✓ | ✓ | [WWW 2019][Feature Generation by Convolutional Neural Network for Click-Through Rate Prediction](https://arxiv.org/pdf/1904.04447.pdf) | | Rank | [Fibinet](models/rank/fibinet/model.py) | ✓ | ✓ | ✓ | ✓ | [RecSys19][FiBiNET: Combining Feature Importance and Bilinear feature Interaction for Click-Through Rate Prediction]( https://arxiv.org/pdf/1905.09433.pdf) | diff --git a/README_CN.md b/README_CN.md index 1f86e6f634ee8f2cd72b00a08f236be540f03d95..81a872e90af08b237ee3ad4bdc29568e8cc0f514 100644 --- a/README_CN.md +++ b/README_CN.md @@ -60,6 +60,7 @@ | 排序 | [DeepFM](models/rank/deepfm/model.py) | ✓ | x | ✓ | x | [IJCAI 2017][DeepFM: A Factorization-Machine based Neural Network for CTR Prediction](https://arxiv.org/pdf/1703.04247.pdf) | | 排序 | [xDeepFM](models/rank/xdeepfm/model.py) | ✓ | x | ✓ | x | [KDD 2018][xDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems](https://dl.acm.org/doi/pdf/10.1145/3219819.3220023) | | 排序 | [DIN](models/rank/din/model.py) | ✓ | x | ✓ | x | [KDD 2018][Deep Interest Network for Click-Through Rate Prediction](https://dl.acm.org/doi/pdf/10.1145/3219819.3219823) | + | 排序 | [DIEN](models/rank/dien/model.py) | ✓ | x | ✓ | x | [AAAI 2019][Deep Interest Evolution Network for Click-Through Rate Prediction](https://www.aaai.org/ojs/index.php/AAAI/article/view/4545/4423) | | 排序 | [Wide&Deep](models/rank/wide_deep/model.py) | ✓ | x | ✓ | x | [DLRS 2016][Wide & Deep Learning for Recommender Systems](https://dl.acm.org/doi/pdf/10.1145/2988450.2988454) | | 排序 | [FGCNN](models/rank/fgcnn/model.py) | ✓ | ✓ | ✓ | ✓ | [WWW 2019][Feature Generation by Convolutional Neural Network for Click-Through Rate Prediction](https://arxiv.org/pdf/1904.04447.pdf) | | 排序 | [Fibinet](models/rank/fibinet/model.py) | ✓ | ✓ | ✓ | ✓ | [RecSys19][FiBiNET: Combining Feature Importance and Bilinear feature Interaction for Click-Through Rate Prediction]( https://arxiv.org/pdf/1905.09433.pdf) | diff --git a/models/rank/dien/__init__.py b/models/rank/dien/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/models/rank/dien/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/models/rank/dien/config.yaml b/models/rank/dien/config.yaml new file mode 100755 index 0000000000000000000000000000000000000000..d47a76070c581b47509c8ecf16d7e631d4c59d08 --- /dev/null +++ b/models/rank/dien/config.yaml @@ -0,0 +1,72 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# global settings +debug: false +workspace: "paddlerec.models.rank.dien" + +dataset: + - name: sample_1 + type: DataLoader + batch_size: 5 + data_path: "{workspace}/data/train_data" + data_converter: "{workspace}/reader.py" + - name: infer_sample + type: DataLoader + batch_size: 5 + data_path: "{workspace}/data/train_data" + data_converter: "{workspace}/reader.py" + +hyper_parameters: + optimizer: + class: SGD + learning_rate: 0.0001 + use_DataLoader: True + item_emb_size: 64 + cat_emb_size: 64 + is_sparse: False + item_count: 63001 + cat_count: 801 + + act: "sigmoid" + + +mode: train_runner + +runner: + - name: train_runner + class: train + epochs: 1 + device: cpu + init_model_path: "" + save_checkpoint_interval: 1 + save_inference_interval: 1 + save_checkpoint_path: "increment" + save_inference_path: "inference" + print_interval: 1 + - name: infer_runner + class: infer + device: cpu + init_model_path: "increment/0" + print_interval: 1 + +phase: +- name: phase1 + model: "{workspace}/model.py" + dataset_name: sample_1 + thread_num: 1 +#- name: infer_phase +# model: "{workspace}/model.py" +# dataset_name: infer_sample +# thread_num: 1 diff --git a/models/rank/dien/data/build_dataset.py b/models/rank/dien/data/build_dataset.py new file mode 100755 index 0000000000000000000000000000000000000000..b0ed187800b2f9f44d4dd0d34df204759059ac06 --- /dev/null +++ b/models/rank/dien/data/build_dataset.py @@ -0,0 +1,101 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
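+ +# Build DIEN samples from the remapped review log: for every user, each +# prefix of the time-ordered click sequence predicts the next item (label 1) +# paired with one uniformly sampled negative item (label 0); the last click +# of each user is held out as the test pair.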
+ +from __future__ import print_function +import random +import pickle + +random.seed(1234) + +print("read and process data") + +with open('./raw_data/remap.pkl', 'rb') as f: + reviews_df = pickle.load(f) + cate_list = pickle.load(f) + user_count, item_count, cate_count, example_count = pickle.load(f) + +train_set = [] +test_set = [] +for reviewerID, hist in reviews_df.groupby('reviewerID'): + pos_list = hist['asin'].tolist() + + def gen_neg(): + neg = pos_list[0] + while neg in pos_list: + neg = random.randint(0, item_count - 1) + return neg + + neg_list = [gen_neg() for i in range(len(pos_list))] + + for i in range(1, len(pos_list)): + hist = pos_list[:i] + if i != len(pos_list) - 1: + train_set.append((reviewerID, hist, pos_list[i], 1)) + train_set.append((reviewerID, hist, neg_list[i], 0)) + else: + label = (pos_list[i], neg_list[i]) + test_set.append((reviewerID, hist, label)) + +random.shuffle(train_set) +random.shuffle(test_set) + +assert len(test_set) == user_count + + +def print_to_file(data, fout): + for i in range(len(data)): + fout.write(str(data[i])) + if i != len(data) - 1: + fout.write(' ') + else: + fout.write(';') + + +print("make train data") +with open("paddle_train.txt", "w") as fout: + for line in train_set: + history = line[1] + target = line[2] + label = line[3] + cate = [cate_list[x] for x in history] + print_to_file(history, fout) + print_to_file(cate, fout) + fout.write(str(target) + ";") + fout.write(str(cate_list[target]) + ";") + fout.write(str(label) + "\n") + +print("make test data") +with open("paddle_test.txt", "w") as fout: + for line in test_set: + history = line[1] + target = line[2] + cate = [cate_list[x] for x in history] + + print_to_file(history, fout) + print_to_file(cate, fout) + fout.write(str(target[0]) + ";") + fout.write(str(cate_list[target[0]]) + ";") + fout.write("1\n") + + print_to_file(history, fout) + print_to_file(cate, fout) + fout.write(str(target[1]) + ";") + fout.write(str(cate_list[target[1]]) + ";") + fout.write("0\n") + +print("make config data") +with open('config.txt', 'w') as f: + f.write(str(user_count) + "\n") + f.write(str(item_count) + "\n") + f.write(str(cate_count) + "\n") diff --git a/models/rank/dien/data/convert_pd.py b/models/rank/dien/data/convert_pd.py new file mode 100755 index 0000000000000000000000000000000000000000..a66290e1561084a10756ab98c3d70b9a5ac5a6ed --- /dev/null +++ b/models/rank/dien/data/convert_pd.py @@ -0,0 +1,41 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
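+ +# Parse the raw Amazon Electronics JSON dumps into pandas DataFrames and +# pickle them; the meta table is filtered to the items that actually appear +# in the 5-core review set.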
+ +from __future__ import print_function +import pickle +import pandas as pd + + +def to_df(file_path): + with open(file_path, 'r') as fin: + df = {} + i = 0 + for line in fin: + df[i] = eval(line) + i += 1 + df = pd.DataFrame.from_dict(df, orient='index') + return df + + +print("start to analyse reviews_Electronics_5.json") +reviews_df = to_df('./raw_data/reviews_Electronics_5.json') +with open('./raw_data/reviews.pkl', 'wb') as f: + pickle.dump(reviews_df, f, pickle.HIGHEST_PROTOCOL) + +print("start to analyse meta_Electronics.json") +meta_df = to_df('./raw_data/meta_Electronics.json') +meta_df = meta_df[meta_df['asin'].isin(reviews_df['asin'].unique())] +meta_df = meta_df.reset_index(drop=True) +with open('./raw_data/meta.pkl', 'wb') as f: + pickle.dump(meta_df, f, pickle.HIGHEST_PROTOCOL) diff --git a/models/rank/dien/data/data_process.sh b/models/rank/dien/data/data_process.sh new file mode 100755 index 0000000000000000000000000000000000000000..7bcfc55f43119315d543e06f16fe0ebc0fecb9fc --- /dev/null +++ b/models/rank/dien/data/data_process.sh @@ -0,0 +1,15 @@ +#! /bin/bash + +set -e +echo "begin download data" +mkdir raw_data +cd raw_data +wget -c http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz +gzip -d reviews_Electronics_5.json.gz +wget -c http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Electronics.json.gz +gzip -d meta_Electronics.json.gz +echo "download data successfully" + +cd .. +python convert_pd.py +python remap_id.py diff --git a/models/rank/dien/data/remap_id.py b/models/rank/dien/data/remap_id.py new file mode 100755 index 0000000000000000000000000000000000000000..ee6983d7f0769a58352f61a0a05bbd81c6ccbc13 --- /dev/null +++ b/models/rank/dien/data/remap_id.py @@ -0,0 +1,62 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
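+ +# Remap string keys to contiguous integer ids: items (asin), categories and +# reviewers are each sorted and numbered from 0, reviews are sorted per user +# by timestamp, and cate_list[i] holds the category id of item i.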
+ +from __future__ import print_function +import random +import pickle +import numpy as np + +random.seed(1234) + +with open('./raw_data/reviews.pkl', 'rb') as f: + reviews_df = pickle.load(f) + reviews_df = reviews_df[['reviewerID', 'asin', 'unixReviewTime']] +with open('./raw_data/meta.pkl', 'rb') as f: + meta_df = pickle.load(f) + meta_df = meta_df[['asin', 'categories']] + meta_df['categories'] = meta_df['categories'].map(lambda x: x[-1][-1]) + + +def build_map(df, col_name): + key = sorted(df[col_name].unique().tolist()) + m = dict(zip(key, range(len(key)))) + df[col_name] = df[col_name].map(lambda x: m[x]) + return m, key + + +asin_map, asin_key = build_map(meta_df, 'asin') +cate_map, cate_key = build_map(meta_df, 'categories') +revi_map, revi_key = build_map(reviews_df, 'reviewerID') + +user_count, item_count, cate_count, example_count =\ + len(revi_map), len(asin_map), len(cate_map), reviews_df.shape[0] +print('user_count: %d\titem_count: %d\tcate_count: %d\texample_count: %d' % + (user_count, item_count, cate_count, example_count)) + +meta_df = meta_df.sort_values('asin') +meta_df = meta_df.reset_index(drop=True) +reviews_df['asin'] = reviews_df['asin'].map(lambda x: asin_map[x]) +reviews_df = reviews_df.sort_values(['reviewerID', 'unixReviewTime']) +reviews_df = reviews_df.reset_index(drop=True) +reviews_df = reviews_df[['reviewerID', 'asin', 'unixReviewTime']] + +cate_list = [meta_df['categories'][i] for i in range(len(asin_map))] +cate_list = np.array(cate_list, dtype=np.int32) + +with open('./raw_data/remap.pkl', 'wb') as f: + pickle.dump(reviews_df, f, pickle.HIGHEST_PROTOCOL) # uid, iid + pickle.dump(cate_list, f, pickle.HIGHEST_PROTOCOL) # cid of iid line + pickle.dump((user_count, item_count, cate_count, example_count), f, + pickle.HIGHEST_PROTOCOL) + pickle.dump((asin_key, cate_key, revi_key), f, pickle.HIGHEST_PROTOCOL) diff --git a/models/rank/dien/data/train_data/paddle_train.100.txt b/models/rank/dien/data/train_data/paddle_train.100.txt new file mode 100755 index 0000000000000000000000000000000000000000..2696a2917f1cb13d070857af67ec56f849f59be5 --- /dev/null +++ b/models/rank/dien/data/train_data/paddle_train.100.txt @@ -0,0 +1,100 @@ +3737 19450;288 196;18486;674;1 +3647 4342 6855 3805;281 463 558 674;4206;463;1 +1805 4309;87 87;21354;556;1 +18209 20753;649 241;51924;610;0 +13150;351;41455;792;1 +35120 40418;157 714;52035;724;0 +13515 20363 25356 26891 24200 11694 33378 34483 35370 27311 40689 33319 28819;558 123 61 110 738 692 110 629 714 463 281 142 382;45554;558;1 +19254 9021 28156 19193 24602 31171;189 462 140 474 157 614;48895;350;1 +4716;194;32497;484;1 +43799 47108;368 140;3503;25;0 +20554 41800 1582 1951;339 776 694 703;4320;234;0 +39713 44272 45136 11687;339 339 339 140;885;168;0 +14398 33997;756 347;20438;703;1 +29341 25727;142 616;4170;512;0 +12197 10212;558 694;31559;24;0 +11551;351;53485;436;1 +4553;196;7331;158;1 +15190 19994 33946 30716 31879 45178 51598 46814;249 498 612 142 746 746 558 174;24353;251;0 +4931 2200 8338 23530;785 792 277 523;3525;251;0 +8881 13274 12683 14696 27693 1395 44373 59704 27762 54268 30326 11811 45371 51598 55859 56039 57678 47250 2073 38932;479 558 190 708 335 684 339 725 446 446 44 575 280 558 262 197 368 111 749 188;12361;616;1 +16297 16797 18629 20922 16727 33946 51165 36796;281 436 462 339 611 612 288 64;34724;288;1 +22237;188;40786;637;0 +5396 39993 42681 49832 11208 34954 36523 45523 51618;351 339 687 281 708 142 629 656 142;38201;571;0 +8881 9029 17043 16620 15021 32706;479 110 110 749 598 251;34941;657;0 
+53255;444;37953;724;1 +1010 4172 8613 11562 11709 13118 2027 15446;674 606 708 436 179 179 692 436;36998;703;0 +22357 24305 15222 19254 22914;189 504 113 189 714;18201;398;1 +1905;694;23877;347;1 +8444 17868;765 712;50732;44;0 +42301 26186 38086;142 450 744;61547;714;0 +18156 35717 32070 45650 47208 20975 36409 44856 48072 15860 47043 53289 53314 33470 47926;157 281 650 142 749 291 707 714 157 205 388 474 708 498 495;48170;746;1 +56219;108;1988;389;0 +22907;83;752;175;0 +22009 32410 42987 48720 683 1289 2731 4736 6306 8442 8946 9928 11536 14947 15793 16694 21736 25156 25797 25874 26573 30318 33946 35420 1492 5236 5555 6625 8867 9638 11443 20225 25965 27273 29001 35302 42336 43347 36907 2012;317 462 291 142 694 10 574 278 708 281 131 142 367 281 258 345 616 708 111 115 339 113 612 24 368 616 39 197 44 214 558 108 616 558 210 210 142 142 262 351;25540;701;0 +20434;196;18056;189;0 +628 5461;194 234;43677;351;0 +16953 15149 45143 23587 5094 25105 51913 54645;484 281 449 792 524 395 388 731;57655;75;1 +13584 7509;234 744;33062;749;1 +170 208 77 109 738 742 1118 15349 255 12067 21643 55453;330 559 744 115 558 674 111 351 694 694 746 111;9821;694;1 +4970 16672;540 746;25685;666;1 +17240 60546;708 629;42110;142;1 +31503 31226 50628 22444;142 156 142 203;47812;749;0 +2443 1763 3403 4225 8951;25 707 351 177 351;7954;351;1 +3748;351;9171;657;1 +1755 26204 42716 32991;446 188 497 746;23910;395;1 +20637 27122;558 44;19669;301;0 +406 872 306 218 883 1372 1705 1709 7774 2376 2879 2881 13329 4992 13594 11106 7131 8631 1736 17585 2568 16896 21971 10296 22361 24108 23300 11793 25351 2648 24593 12692 23883 25345 27129 26321 21627 20738 17784 28785 29281 28366 24723 24319 12083 29882 29974 30443 30428 17072 9783 16700 29421 32253 28830 31299 28792 33931 24973 33112 21717 28339 23978 18649 1841 17635 19696 37448 20862 30492 35736 37450 2633 8675 17412 25960 28389 31032 37157 14555 4996 33388 33393 36237 38946 22793 24337 34963 38819 41165 39551 43019 15570 25129 34593 38385 42915 41407 29907 31289 44229 24267 34975 39462 33274 43251 38302 35502 44056 44675 45233 47690 33472 50149 29409 47183 49188 48192 50628 24103 28313 28358 38882 44330 44346 2019 2484 2675 26396 48143 46039 47722 48559 41719 41720 43920 41983 51235 34964 27287 51915 33586 43630 47258 52137 40954 35120 29572 42405 53559 44900 45761;241 558 395 368 498 110 463 611 558 106 10 112 251 241 48 112 601 674 241 347 733 502 194 119 179 179 578 692 281 115 523 113 281 35 765 196 339 115 90 164 790 708 142 115 342 351 391 281 48 119 74 505 606 68 239 687 687 281 110 281 449 351 38 351 164 176 449 115 70 25 687 115 39 756 35 175 704 119 38 53 115 38 38 142 262 188 614 277 388 615 49 738 106 733 486 666 571 385 708 119 331 463 578 288 142 106 611 611 39 523 388 142 726 702 498 61 142 714 142 654 277 733 603 498 299 97 726 115 637 703 558 74 629 142 142 347 629 746 277 8 49 389 629 408 733 345 157 704 115 398 611 239;49174;368;0 +29206 60955;351 684;61590;76;1 +8427 9692 4411 3266 18234 22774;746 281 396 651 446 44;23393;351;0 +13051 15844 9347 21973 18365 24220 28429 4799 27488 21623 13870 29346 27208 31075 31635 28390 30777 29334 33438 16469 29423 29237 25527 34808 37656 21324 38263 6699 33167 9295 40828 18894;339 342 657 194 20 466 179 225 436 364 707 115 36 523 351 674 694 391 674 500 342 216 707 345 616 495 436 363 395 189 203 766;56816;396;0 +5653 18042 21137 17277 23847 25109 21837 17163 22786 27380 20789 27737 30164 36402 37166 38647 31746 38915 38366 11151 43757 38284 29817 41717 41899 43279 47539 37850 39789 43817 11208 53361 29247 51483 39940 
50917 53618 44055 48997;593 251 616 110 110 110 110 105 436 558 311 142 603 738 398 766 1 351 142 584 674 597 142 483 351 157 373 142 629 39 708 251 339 142 262 1 113 142 462;13418;558;0 +8719 11172;311 217;11707;179;1 +14968 8297 22914 5998 20253 41425 42664 46745 51179 33481 46814 55135 53124 61559;463 766 714 486 628 444 281 714 142 242 174 118 714 714;61908;714;1 +61119;714;22907;83;0 +26172;157;54529;44;0 +13830 10377 8193 16072 13543 18741 24205 18281 37272 27784 16658 27884;384 739 558 739 135 347 558 687 498 142 197 746;34463;177;1 +20842 11756 22110 30562 30697;189 68 483 776 225;49113;483;0 +13646 46782 54138;142 798 142;43698;347;0 +36434;241;51537;629;0 +44121 35325;397 653;43399;397;1 +6438 11107 20073 25026 24434 35533 6318 25028 28352 32359 25734 26280 41466 25192 1909 11753 17770 24301 1728 9693 36444 40256 17961 36780 41093 8788 439 46397 46269 50462 40395 437 2582 4455 12361 14325 22294 26153 26607 29205 29878 33491 38795 41585 45480 51567 54245 19796 52446;356 194 389 89 474 330 347 384 330 90 19 385 177 68 624 68 674 463 624 194 177 389 197 642 239 111 115 113 48 251 554 115 36 163 616 524 84 190 465 398 89 166 113 330 616 449 90 140 330;15142;764;0 +1573;540;18294;463;1 +9837 13438 13690;351 629 24;26044;351;0 +1708 2675 4935 7401 14413 22177 30319 32217 34342 40235 42963 43949 54816;463 115 474 616 474 44 113 279 164 142 616 649 36;31992;115;0 +8025 11769 36188 42006;142 262 714 142;8209;142;0 +30266;176;44167;692;0 +13000 14769 2940 27638 23158;765 27 736 554 112;55050;725;0 +32557 18668 43441;765 707 396;44217;681;1 +5665 5964 18874;542 746 196;16747;179;0 +7014 29912 42468;194 612 558;20800;355;0 +8320 9743 1735 442 5216 11568;234 251 241 603 476 649;32738;153;0 +533 1447;744 744;17843;744;1 +48390 48191;714 714;48864;708;1 +9312 16166 12754 21433 28142 7486;215 674 241 115 558 241;38629;48;1 +10401 11665 10739;142 364 766;5989;463;0 +10408 14363 8807 14947 24701 44676 40914 12241 14906 29247 32347 5834 18291 18313 23375 24075 7020 14307 15891;140 140 749 281 444 388 504 385 196 339 746 351 463 746 197 90 746 576 476;37949;330;1 +50194;444;15572;216;0 +24021;281;25850;140;1 +22185 28726 55777;142 766 351;17;541;1 +31776 34767 28854 34769 38022 38667 32917 9094 40879 41634 42252 19865 47983 38818 40131 40690 18915 48539 49619 18554 24836;70 239 113 48 486 541 352 197 347 385 34 476 704 388 385 281 225 474 157 706 53;25602;707;1 +10544 15159 23606 33556 46886 55061 2079 27022 40345 43556 3807 28732;642 87 641 113 558 157 564 44 194 26 54 113;51293;272;0 +19005 41469 42368 5739 30169 32266 54743 56959 26271;145 482 707 790 101 347 197 368 674;5602;158;0 +7166 16886 21083 7328 25545;560 213 87 744 87;32494;321;1 +2306;260;30286;179;0 +57709 55115;351 483;25035;142;0 +16641 35845;153 311;36985;68;1 +31144 4107;189 168;50619;142;0 +36331 9873 10659 14382 21430 28164;680 197 185 11 115 476;37887;484;1 +19519 3748 33772 22436 38789 46337;649 351 210 115 113 115;23980;649;1 +30789 37586 42354 26171 15017 28654 44960;142 714 142 483 484 474 157;41552;746;1 +52662;576;53627;776;0 +12258 15133 15681 5066 6420 13421 6577 29202 38939;216 558 111 570 447 5 111 281 347;7818;558;0 +610 1258 2332 7508 10814 10797 11710;543 611 611 653 110 201 179;11495;558;1 +12584 2707 1664 25878 25949;790 694 694 142 611;25286;792;1 +32423 24223;135 90;2323;399;0 +11959;197;15349;351;1 +44448 58138 41930 57603 59009 61316 61559 599;339 629 115 388 1 142 714 297;54434;142;0 +43441 12617 47970 52144;396 196 142 629;29211;351;1 +25327 40258;656 398;40261;142;1 +4637;474;59864;687;0 diff --git 
a/models/rank/dien/model.py b/models/rank/dien/model.py new file mode 100755 index 0000000000000000000000000000000000000000..4d1e22783bd1fbea045080b57089be9059c7e9f0 --- /dev/null +++ b/models/rank/dien/model.py @@ -0,0 +1,312 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid + +from paddlerec.core.utils import envs +from paddlerec.core.model import ModelBase + + +class Model(ModelBase): + def __init__(self, config): + ModelBase.__init__(self, config) + + def _init_hyper_parameters(self): + self.item_emb_size = envs.get_global_env( + "hyper_parameters.item_emb_size", 64) + self.cat_emb_size = envs.get_global_env( + "hyper_parameters.cat_emb_size", 64) + self.act = envs.get_global_env("hyper_parameters.act", "sigmoid") + self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse", + False) + # significant for speeding up the training process + self.use_DataLoader = envs.get_global_env( + "hyper_parameters.use_DataLoader", False) + self.item_count = envs.get_global_env("hyper_parameters.item_count", + 63001) + self.cat_count = envs.get_global_env("hyper_parameters.cat_count", 801) + + def input_data(self, is_infer=False, **kwargs): + seq_len = -1 + self.data_var = [] + hist_item_seq = fluid.data( + name="hist_item_seq", shape=[None, 1], dtype="int64", lod_level=1) + self.data_var.append(hist_item_seq) + + hist_cat_seq = fluid.data( + name="hist_cat_seq", shape=[None, 1], dtype="int64", lod_level=1) + self.data_var.append(hist_cat_seq) + + target_item = fluid.data( + name="target_item", shape=[None], dtype="int64") + self.data_var.append(target_item) + + target_cat = fluid.data(name="target_cat", shape=[None], dtype="int64") + self.data_var.append(target_cat) + + label = fluid.data(name="label", shape=[None, 1], dtype="float32") + self.data_var.append(label) + + mask = fluid.data( + name="mask", shape=[None, seq_len, 1], dtype="float32") + self.data_var.append(mask) + + target_item_seq = fluid.data( + name="target_item_seq", shape=[None, seq_len], dtype="int64") + self.data_var.append(target_item_seq) + + target_cat_seq = fluid.data( + name="target_cat_seq", shape=[None, seq_len], dtype="int64") + self.data_var.append(target_cat_seq) + + neg_hist_item_seq = fluid.data( + name="neg_hist_item_seq", + shape=[None, 1], + dtype="int64", + lod_level=1) + self.data_var.append(neg_hist_item_seq) + + neg_hist_cat_seq = fluid.data( + name="neg_hist_cat_seq", + shape=[None, 1], + dtype="int64", + lod_level=1) + self.data_var.append(neg_hist_cat_seq) + + train_inputs = [hist_item_seq] + [hist_cat_seq] + [target_item] + [ + target_cat + ] + [label] + [mask] + [target_item_seq] + [target_cat_seq] + [ + neg_hist_item_seq + ] + [neg_hist_cat_seq] + return train_inputs + + def din_attention(self, hist, target_expand, mask, return_alpha=False): + """activation weight""" + + hidden_size = hist.shape[-1] + + concat = fluid.layers.concat( + [hist, target_expand, hist - target_expand, hist * target_expand], + 
axis=2) + atten_fc1 = fluid.layers.fc(name="atten_fc1", + input=concat, + size=80, + act=self.act, + num_flatten_dims=2) + atten_fc2 = fluid.layers.fc(name="atten_fc2", + input=atten_fc1, + size=40, + act=self.act, + num_flatten_dims=2) + atten_fc3 = fluid.layers.fc(name="atten_fc3", + input=atten_fc2, + size=1, + num_flatten_dims=2) + atten_fc3 += mask + atten_fc3 = fluid.layers.transpose(x=atten_fc3, perm=[0, 2, 1]) + atten_fc3 = fluid.layers.scale(x=atten_fc3, scale=hidden_size**-0.5) + weight = fluid.layers.softmax(atten_fc3) + weighted = fluid.layers.transpose(x=weight, perm=[0, 2, 1]) + weighted_vector = weighted * hist + if return_alpha: + return hist, weighted + return weighted_vector + + def net(self, inputs, is_infer=False): + + # ------------------------- network input -------------------------- + + hist_item_seq = inputs[0] # history item sequence + hist_cat_seq = inputs[1] # history category sequence + target_item = inputs[2] # one dim target item + target_cat = inputs[3] # one dim target category + label = inputs[4] # label + mask = inputs[5] # mask + target_item_seq = inputs[6] # target item expand to sequence + target_cat_seq = inputs[7] # target category expand to sequence + neg_hist_item_seq = inputs[8] # neg item sampling for aux loss + neg_hist_cat_seq = inputs[9] # neg cat sampling for aux loss + + item_emb_attr = fluid.ParamAttr(name="item_emb") + cat_emb_attr = fluid.ParamAttr(name="cat_emb") + + # ------------------------- Embedding Layer -------------------------- + + hist_item_emb = fluid.embedding( + input=hist_item_seq, + size=[self.item_count, self.item_emb_size], + param_attr=item_emb_attr, + is_sparse=self.is_sparse) + + neg_hist_cat_emb = fluid.embedding( + input=neg_hist_cat_seq, + size=[self.cat_count, self.cat_emb_size], + param_attr=cat_emb_attr, + is_sparse=self.is_sparse) + + neg_hist_item_emb = fluid.embedding( + input=neg_hist_item_seq, + size=[self.item_count, self.item_emb_size], + param_attr=item_emb_attr, + is_sparse=self.is_sparse) + + hist_cat_emb = fluid.embedding( + input=hist_cat_seq, + size=[self.cat_count, self.cat_emb_size], + param_attr=cat_emb_attr, + is_sparse=self.is_sparse) + + target_item_emb = fluid.embedding( + input=target_item, + size=[self.item_count, self.item_emb_size], + param_attr=item_emb_attr, + is_sparse=self.is_sparse) + + target_cat_emb = fluid.embedding( + input=target_cat, + size=[self.cat_count, self.cat_emb_size], + param_attr=cat_emb_attr, + is_sparse=self.is_sparse) + + target_item_seq_emb = fluid.embedding( + input=target_item_seq, + size=[self.item_count, self.item_emb_size], + param_attr=item_emb_attr, + is_sparse=self.is_sparse) + + target_cat_seq_emb = fluid.embedding( + input=target_cat_seq, + size=[self.cat_count, self.cat_emb_size], + param_attr=cat_emb_attr, + is_sparse=self.is_sparse) + + item_b = fluid.embedding( + input=target_item, + size=[self.item_count, 1], + param_attr=fluid.initializer.Constant(value=0.0)) + + # ------------------------- Interest Extractor Layer -------------------------- + + hist_seq_concat = fluid.layers.concat( + [hist_item_emb, hist_cat_emb], axis=2) + neg_hist_seq_concat = fluid.layers.concat( + [neg_hist_item_emb, neg_hist_cat_emb], axis=2) + target_seq_concat = fluid.layers.concat( + [target_item_seq_emb, target_cat_seq_emb], axis=2) + target_concat = fluid.layers.concat( + [target_item_emb, target_cat_emb], axis=1) + + reshape_hist_item_emb = fluid.layers.reduce_sum(hist_seq_concat, dim=1) + neg_reshape_hist_item_emb = fluid.layers.reduce_sum( + neg_hist_seq_concat, 
dim=1) + gru_input_hist_item_emb = fluid.layers.concat( + [reshape_hist_item_emb] * 3, axis=1) + + gru_h1 = fluid.layers.dynamic_gru( + gru_input_hist_item_emb, size=self.item_emb_size * 2) + gru_h1_input = fluid.layers.concat([gru_h1] * 3, axis=1) + gru_h2 = fluid.layers.dynamic_gru( + gru_h1_input, size=self.item_emb_size * 2) + + # ------------------------- Auxiliary loss -------------------------- + + # the extractor GRU state at step t is trained to tell the real click + # at step t + 1 from a sampled negative: + # L_aux = -E[log(sigmoid(h_t * e_pos)) + log(1 - sigmoid(h_t * e_neg))] + pad_value = fluid.layers.zeros(shape=[1], dtype='float32') + start_value = fluid.layers.zeros(shape=[1], dtype='int32') + gru_out_pad, lengths = fluid.layers.sequence_pad(gru_h2, pad_value) + pos_seq_pad, _ = fluid.layers.sequence_pad(reshape_hist_item_emb, + pad_value) + neg_seq_pad, _ = fluid.layers.sequence_pad(neg_reshape_hist_item_emb, + pad_value) + seq_shape = fluid.layers.shape(pos_seq_pad) + test_pos = fluid.layers.reduce_sum( + fluid.layers.reduce_sum( + fluid.layers.log( + fluid.layers.sigmoid( + fluid.layers.reduce_sum( + gru_out_pad[:, start_value:seq_shape[1] - 1, :] * + pos_seq_pad[:, start_value + 1:seq_shape[1], :], + dim=2, + keep_dim=True))), + dim=2), + dim=1, + keep_dim=True) + test_neg = fluid.layers.reduce_sum( + fluid.layers.reduce_sum( + fluid.layers.log( + 1.0 - fluid.layers.sigmoid( + fluid.layers.reduce_sum( + gru_out_pad[:, start_value:seq_shape[1] - 1, :] * + neg_seq_pad[:, start_value + 1:seq_shape[1], :], + dim=2, + keep_dim=True))), + dim=2), + dim=1, + keep_dim=True) + aux_loss = fluid.layers.mean(-1.0 * (test_pos + test_neg)) + + # ------------------------- Interest Evolving Layer (GRU with attentional input (AIGRU)) -------------------------- + + weighted_vector = self.din_attention(gru_out_pad, target_seq_concat, + mask) + weighted_vector = fluid.layers.transpose(weighted_vector, [1, 0, 2]) + concat_weighted_vector = fluid.layers.concat( + [weighted_vector] * 3, axis=2) + + attention_rnn = fluid.layers.StaticRNN(name="attention_evolution") + + with attention_rnn.step(): + word = attention_rnn.step_input(concat_weighted_vector) + prev = attention_rnn.memory( + shape=[-1, self.item_emb_size * 2], batch_ref=word) + hidden, _, _ = fluid.layers.gru_unit( + input=word, hidden=prev, size=self.item_emb_size * 6) + attention_rnn.update_memory(prev, hidden) + attention_rnn.output(hidden) + + attention_rnn_res = attention_rnn() + attention_rnn_res_T = fluid.layers.transpose(attention_rnn_res, + [1, 0, 2])[:, -1, :] + + out = fluid.layers.sequence_pool(input=hist_item_emb, pool_type='sum') + out_fc = fluid.layers.fc(name="out_fc", + input=out, + size=self.item_emb_size + self.cat_emb_size, + num_flatten_dims=1) + embedding_concat = fluid.layers.concat( + [attention_rnn_res_T, target_concat], axis=1) + + fc1 = fluid.layers.fc(name="fc1", + input=embedding_concat, + size=80, + act=self.act) + fc2 = fluid.layers.fc(name="fc2", input=fc1, size=40, act=self.act) + fc3 = fluid.layers.fc(name="fc3", input=fc2, size=1) + logit = fc3 + item_b + + loss = fluid.layers.sigmoid_cross_entropy_with_logits( + x=logit, label=label) + + avg_loss = fluid.layers.mean(loss) + aux_loss + self._cost = avg_loss + + self.predict = fluid.layers.sigmoid(logit) + predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1) + label_int = fluid.layers.cast(label, 'int64') + auc_var, batch_auc_var, _ = fluid.layers.auc(input=predict_2d, + label=label_int, + slide_steps=0) + self._metrics["AUC"] = auc_var + self._metrics["BATCH_AUC"] = batch_auc_var + if is_infer: + self._infer_results["AUC"] = auc_var diff --git a/models/rank/dien/reader.py b/models/rank/dien/reader.py new file mode 100755 index 
0000000000000000000000000000000000000000..fecc9c4c4948157341fa74c42469f26fddb2deae --- /dev/null +++ b/models/rank/dien/reader.py @@ -0,0 +1,176 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function + +import os +import random + +try: + import cPickle as pickle +except ImportError: + import pickle + +import numpy as np + +from paddlerec.core.reader import ReaderBase +from paddlerec.core.utils import envs + + +class Reader(ReaderBase): + def init(self): + self.train_data_path = envs.get_global_env( + "dataset.sample_1.data_path", None) + self.res = [] + self.max_len = 0 + self.neg_candidate_item = [] + self.neg_candidate_cat = [] + self.max_neg_item = 10000 + self.max_neg_cat = 1000 + + data_file_list = os.listdir(self.train_data_path) + for i in range(0, len(data_file_list)): + train_data_file = os.path.join(self.train_data_path, + data_file_list[i]) + with open(train_data_file, "r") as fin: + for line in fin: + line = line.strip().split(';') + hist = line[0].split() + self.max_len = max(self.max_len, len(hist)) + with open("tmp.txt", "w") as fo: + fo.write(str(self.max_len)) + self.batch_size = envs.get_global_env("dataset.sample_1.batch_size", + 32, "train.reader") + self.group_size = self.batch_size * 20 + + def _process_line(self, line): + line = line.strip().split(';') + hist = line[0].split() + hist = [int(i) for i in hist] + cate = line[1].split() + cate = [int(i) for i in cate] + return [hist, cate, [int(line[2])], [int(line[3])], [float(line[4])]] + + def generate_sample(self, line): + """ + Read the data line by line and process it as a dictionary + """ + + def data_iter(): + yield self._process_line(line) + + return data_iter + + def pad_batch_data(self, input, max_len): + res = np.array([x + [0] * (max_len - len(x)) for x in input]) + res = res.astype("int64").reshape([-1, max_len]) + return res + + def make_data(self, b): + max_len = max(len(x[0]) for x in b) + # item = self.pad_batch_data([x[0] for x in b], max_len) + # cat = self.pad_batch_data([x[1] for x in b], max_len) + item = [x[0] for x in b] + cat = [x[1] for x in b] + neg_item = [None] * len(item) + neg_cat = [None] * len(cat) + + # maintain bounded pools of previously seen item/category ids; once a + # pool is full, overwrite a random slice so it keeps refreshing, then + # draw the per-position negatives uniformly from these pools + for i in range(len(b)): + neg_item[i] = [] + neg_cat[i] = [] + if len(self.neg_candidate_item) < self.max_neg_item: + self.neg_candidate_item.extend(b[i][0]) + if len(self.neg_candidate_item) > self.max_neg_item: + self.neg_candidate_item = self.neg_candidate_item[ + 0:self.max_neg_item] + else: + len_seq = len(b[i][0]) + start_idx = random.randint(0, self.max_neg_item - len_seq - 1) + self.neg_candidate_item[start_idx:start_idx + len_seq] = b[ + i][0] + + if len(self.neg_candidate_cat) < self.max_neg_cat: + self.neg_candidate_cat.extend(b[i][1]) + if len(self.neg_candidate_cat) > self.max_neg_cat: + self.neg_candidate_cat = self.neg_candidate_cat[ + 0:self.max_neg_cat] + else: + len_seq = len(b[i][1]) + start_idx = 
random.randint(0, self.max_neg_cat - len_seq - 1) + self.neg_candidate_cat[start_idx:start_idx + len_seq] = b[ + i][1] + for _ in range(len(b[i][0])): + neg_item[i].append(self.neg_candidate_item[random.randint( + 0, len(self.neg_candidate_item) - 1)]) + for _ in range(len(b[i][1])): + neg_cat[i].append(self.neg_candidate_cat[random.randint( + 0, len(self.neg_candidate_cat) - 1)]) + + len_array = [len(x[0]) for x in b] + mask = np.array( + [[0] * x + [-1e9] * (max_len - x) for x in len_array]).reshape( + [-1, max_len, 1]) + target_item_seq = np.array( + [[x[2]] * max_len for x in b]).astype("int64").reshape( + [-1, max_len]) + target_cat_seq = np.array( + [[x[3]] * max_len for x in b]).astype("int64").reshape( + [-1, max_len]) + res = [] + for i in range(len(b)): + res.append([ + item[i], cat[i], b[i][2], b[i][3], b[i][4], mask[i], + target_item_seq[i], target_cat_seq[i], neg_item[i], neg_cat[i] + ]) + return res + + def batch_reader(self, reader, batch_size, group_size): + def batch_reader(): + bg = [] + for line in reader: + bg.append(line) + if len(bg) == group_size: + sortb = sorted(bg, key=lambda x: len(x[0]), reverse=False) + bg = [] + for i in range(0, group_size, batch_size): + b = sortb[i:i + batch_size] + yield self.make_data(b) + len_bg = len(bg) + if len_bg != 0: + sortb = sorted(bg, key=lambda x: len(x[0]), reverse=False) + bg = [] + remain = len_bg % batch_size + for i in range(0, len_bg - remain, batch_size): + b = sortb[i:i + batch_size] + yield self.make_data(b) + + return batch_reader + + def base_read(self, file_dir): + res = [] + for train_file in file_dir: + with open(train_file, "r") as fin: + for line in fin: + line = line.strip().split(';') + hist = line[0].split() + cate = line[1].split() + res.append([hist, cate, line[2], line[3], float(line[4])]) + return res + + def generate_batch_from_trainfiles(self, files): + data_set = self.base_read(files) + random.shuffle(data_set) + return self.batch_reader(data_set, self.batch_size, + self.batch_size * 20) diff --git a/models/rank/readme.md b/models/rank/readme.md index b1939c9d3776eab86b48589698baa516130e9f60..18da95aa5f604311038f4e5e8c704b834fa6f275 100644 --- a/models/rank/readme.md +++ b/models/rank/readme.md @@ -36,6 +36,7 @@ | AFM | Attentional Factorization Machines | [Attentional Factorization Machines: Learning the Weight of Feature Interactions via Attention Networks](https://arxiv.org/pdf/1708.04617.pdf)(2017) | | xDeepFM | xDeepFM | [xDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems](https://dl.acm.org/doi/pdf/10.1145/3219819.3220023)(2018) | | DIN | Deep Interest Network | [Deep Interest Network for Click-Through Rate Prediction](https://dl.acm.org/doi/pdf/10.1145/3219819.3219823)(2018) | +| DIEN | Deep Interest Evolution Network | [Deep Interest Evolution Network for Click-Through Rate Prediction](https://www.aaai.org/ojs/index.php/AAAI/article/view/4545/4423)(2019) | | FGCNN | Feature Generation by CNN | [Feature Generation by Convolutional Neural Network for Click-Through Rate Prediction](https://arxiv.org/pdf/1904.04447.pdf)(2019) | | FIBINET | Combining Feature Importance and Bilinear feature Interaction | [《FiBiNET: Combining Feature Importance and Bilinear feature Interaction for Click-Through Rate Prediction》]( https://arxiv.org/pdf/1905.09433.pdf)(2019) |
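
For reference, each sample line written by build_dataset.py and consumed by reader.py packs five ';'-separated fields: the space-separated item-id history, the matching category-id history, the target item id, the target category id, and the click label. A minimal standalone sketch of that parsing (the helper name parse_sample is illustrative, not part of the patch):

    def parse_sample(line):
        # "hist_items;hist_cats;target_item;target_cat;label"
        hist, cats, target, target_cat, label = line.strip().split(';')
        return ([int(i) for i in hist.split()],
                [int(i) for i in cats.split()],
                int(target), int(target_cat), float(label))

    # first line of data/train_data/paddle_train.100.txt:
    print(parse_sample("3737 19450;288 196;18486;674;1"))
    # -> ([3737, 19450], [288, 196], 18486, 674, 1.0)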