From c4ff2799f96198d713e66a4849d6009fbdbf188c Mon Sep 17 00:00:00 2001 From: zhang wenhui Date: Sat, 10 Oct 2020 13:16:36 +0800 Subject: [PATCH] Delete PaddleRec model (#4872) * update api 1.8 * fix paddlerec readme * delete some model * delete some model * update --- PaddleRec/ctr/README.md | 7 - PaddleRec/ctr/dcn/README.md | 104 +- PaddleRec/ctr/dcn/config.py | 85 -- PaddleRec/ctr/dcn/data/download.py | 24 - PaddleRec/ctr/dcn/data/preprocess.py | 131 --- PaddleRec/ctr/dcn/dist_data/dist_download.py | 19 - .../ctr/dcn/dist_data/dist_preprocess.py | 122 -- PaddleRec/ctr/dcn/network.py | 143 --- PaddleRec/ctr/dcn/reader.py | 94 -- PaddleRec/ctr/dcn/utils.py | 24 - PaddleRec/ctr/deepfm/README.md | 98 +- PaddleRec/ctr/deepfm/args.py | 73 -- .../deepfm/data/aid_data/train_file_idx.txt | 1 - .../ctr/deepfm/data/download_preprocess.py | 25 - PaddleRec/ctr/deepfm/data/preprocess.py | 101 -- .../deepfm/dist_data/dist_data_download.py | 22 - .../ctr/deepfm/dist_data/preprocess_dist.py | 75 -- PaddleRec/ctr/deepfm/network_conf.py | 115 -- .../ctr/deepfm/picture/deepfm_result.png | Bin 24149 -> 0 bytes PaddleRec/ctr/deepfm/utils.py | 24 - .../data/aid_data/train_file_idx.txt | 1 - .../data/download_preprocess.py | 27 - .../ctr/deepfm_dygraph/data/preprocess.py | 120 -- PaddleRec/ctr/deepfm_dygraph/network.py | 75 +- PaddleRec/ctr/deepfm_dygraph/train.py | 238 ++-- PaddleRec/ctr/din/README.md | 118 +- PaddleRec/ctr/din/__init__.py | 0 PaddleRec/ctr/din/_ce.py | 62 - PaddleRec/ctr/din/data/build_dataset.py | 87 -- PaddleRec/ctr/din/data/convert_pd.py | 27 - PaddleRec/ctr/din/data/data_process.sh | 15 - PaddleRec/ctr/din/data/remap_id.py | 48 - PaddleRec/ctr/din/network.py | 140 --- PaddleRec/ctr/din/reader.py | 96 -- PaddleRec/ctr/dnn/README.md | 783 +------------ PaddleRec/ctr/dnn/__init__.py | 0 PaddleRec/ctr/dnn/dataset_generator.py | 65 -- PaddleRec/ctr/dnn/download_data.sh | 13 - PaddleRec/ctr/dnn/feed_generator.py | 69 -- PaddleRec/ctr/dnn/infer.py | 184 --- PaddleRec/ctr/dnn/local_cluster.sh | 70 -- PaddleRec/ctr/dnn/network_conf.py | 93 -- PaddleRec/ctr/dnn/train.py | 279 ----- PaddleRec/ctr/dnn/utils.py | 24 - PaddleRec/ctr/wide_deep/README.md | 181 +-- PaddleRec/ctr/wide_deep/args.py | 42 - PaddleRec/ctr/wide_deep/create_data.sh | 17 - PaddleRec/ctr/wide_deep/data_preparation.py | 109 -- PaddleRec/ctr/wide_deep/infer_cpu.sh | 9 - PaddleRec/ctr/wide_deep/infer_gpu.sh | 9 - PaddleRec/ctr/wide_deep/net.py | 70 -- PaddleRec/ctr/wide_deep/requirements.txt | 132 --- PaddleRec/ctr/wide_deep/train_cpu.sh | 9 - PaddleRec/ctr/wide_deep/train_gpu.sh | 8 - PaddleRec/ctr/wide_deep/utils.py | 31 - PaddleRec/ctr/xdeepfm/README.md | 68 +- PaddleRec/ctr/xdeepfm/args.py | 81 -- PaddleRec/ctr/xdeepfm/data/download.py | 28 - PaddleRec/ctr/xdeepfm/network_conf.py | 138 --- PaddleRec/ctr/xdeepfm/utils.py | 24 - PaddleRec/dssm/README.md | 140 +-- PaddleRec/dssm/args.py | 36 - PaddleRec/dssm/dssm.py | 119 -- PaddleRec/dssm/infer_cpu.sh | 2 - PaddleRec/dssm/infer_gpu.sh | 2 - PaddleRec/dssm/train_cpu.sh | 9 - PaddleRec/dssm/train_gpu.sh | 9 - PaddleRec/gnn/README.md | 114 +- PaddleRec/gnn/__init__.py | 0 PaddleRec/gnn/_ce.py | 60 - PaddleRec/gnn/data/download.py | 47 - PaddleRec/gnn/data/preprocess.py | 256 ----- PaddleRec/gnn/network.py | 192 ---- PaddleRec/gnn/reader.py | 123 -- PaddleRec/gru4rec/README.md | 262 +---- PaddleRec/gru4rec/__init__.py | 0 PaddleRec/gru4rec/_ce.py | 66 -- PaddleRec/gru4rec/convert_format.py | 33 - PaddleRec/gru4rec/net.py | 219 ---- PaddleRec/gru4rec/test_data/small_test.txt | 100 -- 
PaddleRec/gru4rec/text2paddle.py | 101 -- PaddleRec/gru4rec/train_data/small_train.txt | 100 -- PaddleRec/gru4rec/utils.py | 211 ---- PaddleRec/gru4rec/vocab.txt | 1 - PaddleRec/multi_task/esmm/README.md | 134 +-- PaddleRec/multi_task/esmm/args.py | 44 - PaddleRec/multi_task/esmm/cpu_infer.sh | 5 - PaddleRec/multi_task/esmm/cpu_train.sh | 8 - .../multi_task/esmm/dataset_generator.py | 44 - PaddleRec/multi_task/esmm/get_data.sh | 26 - PaddleRec/multi_task/esmm/gpu_infer.sh | 4 - PaddleRec/multi_task/esmm/gpu_train.sh | 8 - PaddleRec/multi_task/esmm/net.py | 83 -- PaddleRec/multi_task/esmm/reader.py | 157 --- PaddleRec/multi_task/esmm/utils.py | 33 - PaddleRec/multi_task/mmoe/README.md | 136 +-- PaddleRec/multi_task/mmoe/args.py | 46 - PaddleRec/multi_task/mmoe/create_data.sh | 16 - PaddleRec/multi_task/mmoe/data_preparation.py | 104 -- PaddleRec/multi_task/mmoe/mmoe_train.py | 177 --- PaddleRec/multi_task/mmoe/requirements.txt | 1 - PaddleRec/multi_task/mmoe/train_cpu.sh | 11 - PaddleRec/multi_task/mmoe/train_gpu.sh | 11 - PaddleRec/multi_task/mmoe/utils.py | 48 - PaddleRec/multi_task/share_bottom/README.md | 132 +-- PaddleRec/multi_task/share_bottom/args.py | 46 - .../multi_task/share_bottom/create_data.sh | 16 - .../share_bottom/data_preparation.py | 103 -- .../multi_task/share_bottom/requirements.txt | 1 - .../multi_task/share_bottom/share_bottom.py | 163 --- .../multi_task/share_bottom/train_cpu.sh | 13 - .../multi_task/share_bottom/train_gpu.sh | 13 - PaddleRec/multi_task/share_bottom/utils.py | 48 - PaddleRec/multiview_simnet/README.cn.md | 29 - PaddleRec/multiview_simnet/README.md | 34 +- PaddleRec/multiview_simnet/__init__.py | 0 PaddleRec/multiview_simnet/_ce.py | 58 - PaddleRec/multiview_simnet/nets.py | 257 ----- PaddleRec/multiview_simnet/reader.py | 67 -- PaddleRec/ncf/Dataset.py | 35 - PaddleRec/ncf/README.md | 130 +-- PaddleRec/ncf/args.py | 24 - PaddleRec/ncf/create_data.sh | 6 - PaddleRec/ncf/evaluate.py | 122 -- PaddleRec/ncf/get_train_data.py | 56 - PaddleRec/ncf/gmf.py | 31 - PaddleRec/ncf/mlp.py | 44 - PaddleRec/ncf/neumf.py | 64 -- PaddleRec/ncf/requirements.txt | 132 --- PaddleRec/ncf/train_cpu.sh | 8 - PaddleRec/ncf/train_gpu.sh | 8 - PaddleRec/ncf/utils.py | 39 - PaddleRec/rerank/listwise/README.md | 126 +-- PaddleRec/rerank/listwise/args.py | 39 - PaddleRec/rerank/listwise/evaluator.py | 118 -- PaddleRec/rerank/listwise/infer_cpu.sh | 1 - PaddleRec/rerank/listwise/infer_gpu.sh | 1 - PaddleRec/rerank/listwise/train_cpu.sh | 1 - PaddleRec/rerank/listwise/train_gpu.sh | 1 - PaddleRec/rerank/listwise/utils.py | 36 - PaddleRec/ssr/README.md | 52 +- PaddleRec/ssr/__init__.py | 0 PaddleRec/ssr/_ce.py | 66 -- PaddleRec/ssr/infer.py | 136 --- PaddleRec/ssr/nets.py | 127 --- PaddleRec/ssr/reader.py | 90 -- PaddleRec/ssr/test_data/small_test.txt | 100 -- PaddleRec/ssr/train.py | 174 --- PaddleRec/ssr/train_data/small_train.txt | 100 -- PaddleRec/ssr/utils.py | 66 -- PaddleRec/ssr/vocab.txt | 1 - PaddleRec/tagspace/README.md | 83 +- PaddleRec/tagspace/__init.py__ | 0 PaddleRec/tagspace/_ce.py | 66 -- PaddleRec/tagspace/net.py | 64 -- PaddleRec/tagspace/test_data/small_test.csv | 1000 ----------------- PaddleRec/tagspace/text2paddle.py | 91 -- PaddleRec/tagspace/train_data/small_train.csv | 1000 ----------------- PaddleRec/tagspace/utils.py | 198 ---- PaddleRec/tagspace/vocab_tag.txt | 1 - PaddleRec/tagspace/vocab_text.txt | 1 - PaddleRec/tdm/README.md | 33 +- PaddleRec/tdm/tdm_demo/README.md | 709 ------------ PaddleRec/tdm/tdm_demo/args.py | 145 --- 
PaddleRec/tdm/tdm_demo/async_train.sh | 38 - .../data/test/demo_fake_test_data.txt | 10 - .../data/train/demo_fake_train_data.txt | 10 - PaddleRec/tdm/tdm_demo/img/demo_network.png | Bin 1239759 -> 0 bytes PaddleRec/tdm/tdm_demo/img/demo_tree.png | Bin 537496 -> 0 bytes PaddleRec/tdm/tdm_demo/img/dnn-net.png | Bin 585019 -> 0 bytes PaddleRec/tdm/tdm_demo/img/input-net.png | Bin 273636 -> 0 bytes PaddleRec/tdm/tdm_demo/infer_network.py | 195 ---- PaddleRec/tdm/tdm_demo/local_cluster.sh | 69 -- PaddleRec/tdm/tdm_demo/run_infer.sh | 32 - PaddleRec/tdm/tdm_demo/run_predict.sh | 32 - PaddleRec/tdm/tdm_demo/run_train.sh | 33 - .../tdm/tdm_demo/thirdparty/layer_list.txt | 4 - .../tdm/tdm_demo/thirdparty/travel_list.txt | 13 - .../tdm/tdm_demo/thirdparty/tree_emb.txt | 26 - .../tdm/tdm_demo/thirdparty/tree_info.txt | 26 - PaddleRec/tdm/tdm_demo/train_network.py | 365 ------ PaddleRec/tdm/tdm_demo/utils.py | 126 --- PaddleRec/text_classification/README.md | 38 +- PaddleRec/text_classification/net.py | 39 - PaddleRec/text_classification/train.py | 31 - PaddleRec/word2vec/README.md | 108 +- PaddleRec/word2vec/net.py | 226 ---- PaddleRec/word2vec/preprocess.py | 201 ---- PaddleRec/word2vec/reader.py | 106 -- PaddleRec/word2vec/utils.py | 117 -- PaddleRec/youbube_dnn/README.md | 135 +-- PaddleRec/youbube_dnn/args.py | 43 - PaddleRec/youbube_dnn/get_topk.py | 34 - PaddleRec/youbube_dnn/infer_cpu.sh | 1 - PaddleRec/youbube_dnn/infer_gpu.sh | 1 - PaddleRec/youbube_dnn/rec_topk.sh | 1 - PaddleRec/youbube_dnn/train_cpu.sh | 1 - PaddleRec/youbube_dnn/train_gpu.sh | 1 - PaddleRec/youbube_dnn/youtubednn.py | 52 - dygraph/mnist/train.py | 250 ++--- 200 files changed, 310 insertions(+), 16534 deletions(-) delete mode 100644 PaddleRec/ctr/README.md delete mode 100644 PaddleRec/ctr/dcn/config.py delete mode 100644 PaddleRec/ctr/dcn/data/download.py delete mode 100644 PaddleRec/ctr/dcn/data/preprocess.py delete mode 100644 PaddleRec/ctr/dcn/dist_data/dist_download.py delete mode 100644 PaddleRec/ctr/dcn/dist_data/dist_preprocess.py delete mode 100644 PaddleRec/ctr/dcn/network.py delete mode 100644 PaddleRec/ctr/dcn/reader.py delete mode 100644 PaddleRec/ctr/dcn/utils.py delete mode 100644 PaddleRec/ctr/deepfm/args.py delete mode 100644 PaddleRec/ctr/deepfm/data/aid_data/train_file_idx.txt delete mode 100644 PaddleRec/ctr/deepfm/data/download_preprocess.py delete mode 100644 PaddleRec/ctr/deepfm/data/preprocess.py delete mode 100644 PaddleRec/ctr/deepfm/dist_data/dist_data_download.py delete mode 100644 PaddleRec/ctr/deepfm/dist_data/preprocess_dist.py delete mode 100644 PaddleRec/ctr/deepfm/network_conf.py delete mode 100644 PaddleRec/ctr/deepfm/picture/deepfm_result.png delete mode 100644 PaddleRec/ctr/deepfm/utils.py delete mode 100644 PaddleRec/ctr/deepfm_dygraph/data/aid_data/train_file_idx.txt delete mode 100644 PaddleRec/ctr/deepfm_dygraph/data/download_preprocess.py delete mode 100644 PaddleRec/ctr/deepfm_dygraph/data/preprocess.py delete mode 100644 PaddleRec/ctr/din/__init__.py delete mode 100644 PaddleRec/ctr/din/_ce.py delete mode 100644 PaddleRec/ctr/din/data/build_dataset.py delete mode 100644 PaddleRec/ctr/din/data/convert_pd.py delete mode 100644 PaddleRec/ctr/din/data/data_process.sh delete mode 100644 PaddleRec/ctr/din/data/remap_id.py delete mode 100644 PaddleRec/ctr/din/network.py delete mode 100644 PaddleRec/ctr/din/reader.py delete mode 100644 PaddleRec/ctr/dnn/__init__.py delete mode 100644 PaddleRec/ctr/dnn/dataset_generator.py delete mode 100644 PaddleRec/ctr/dnn/download_data.sh delete mode 
100644 PaddleRec/ctr/dnn/feed_generator.py delete mode 100644 PaddleRec/ctr/dnn/infer.py delete mode 100644 PaddleRec/ctr/dnn/local_cluster.sh delete mode 100644 PaddleRec/ctr/dnn/network_conf.py delete mode 100644 PaddleRec/ctr/dnn/train.py delete mode 100644 PaddleRec/ctr/dnn/utils.py delete mode 100644 PaddleRec/ctr/wide_deep/args.py delete mode 100644 PaddleRec/ctr/wide_deep/create_data.sh delete mode 100644 PaddleRec/ctr/wide_deep/data_preparation.py delete mode 100644 PaddleRec/ctr/wide_deep/infer_cpu.sh delete mode 100644 PaddleRec/ctr/wide_deep/infer_gpu.sh delete mode 100644 PaddleRec/ctr/wide_deep/net.py delete mode 100644 PaddleRec/ctr/wide_deep/requirements.txt delete mode 100644 PaddleRec/ctr/wide_deep/train_cpu.sh delete mode 100644 PaddleRec/ctr/wide_deep/train_gpu.sh delete mode 100644 PaddleRec/ctr/wide_deep/utils.py delete mode 100644 PaddleRec/ctr/xdeepfm/args.py delete mode 100644 PaddleRec/ctr/xdeepfm/data/download.py delete mode 100644 PaddleRec/ctr/xdeepfm/network_conf.py delete mode 100644 PaddleRec/ctr/xdeepfm/utils.py delete mode 100644 PaddleRec/dssm/args.py delete mode 100644 PaddleRec/dssm/dssm.py delete mode 100644 PaddleRec/dssm/infer_cpu.sh delete mode 100644 PaddleRec/dssm/infer_gpu.sh delete mode 100644 PaddleRec/dssm/train_cpu.sh delete mode 100644 PaddleRec/dssm/train_gpu.sh delete mode 100644 PaddleRec/gnn/__init__.py delete mode 100644 PaddleRec/gnn/_ce.py delete mode 100644 PaddleRec/gnn/data/download.py delete mode 100755 PaddleRec/gnn/data/preprocess.py delete mode 100644 PaddleRec/gnn/network.py delete mode 100644 PaddleRec/gnn/reader.py delete mode 100644 PaddleRec/gru4rec/__init__.py delete mode 100644 PaddleRec/gru4rec/_ce.py delete mode 100644 PaddleRec/gru4rec/convert_format.py delete mode 100644 PaddleRec/gru4rec/net.py delete mode 100644 PaddleRec/gru4rec/test_data/small_test.txt delete mode 100644 PaddleRec/gru4rec/text2paddle.py delete mode 100644 PaddleRec/gru4rec/train_data/small_train.txt delete mode 100644 PaddleRec/gru4rec/utils.py delete mode 100644 PaddleRec/gru4rec/vocab.txt delete mode 100644 PaddleRec/multi_task/esmm/args.py delete mode 100644 PaddleRec/multi_task/esmm/cpu_infer.sh delete mode 100644 PaddleRec/multi_task/esmm/cpu_train.sh delete mode 100644 PaddleRec/multi_task/esmm/dataset_generator.py delete mode 100644 PaddleRec/multi_task/esmm/get_data.sh delete mode 100644 PaddleRec/multi_task/esmm/gpu_infer.sh delete mode 100644 PaddleRec/multi_task/esmm/gpu_train.sh delete mode 100644 PaddleRec/multi_task/esmm/net.py delete mode 100644 PaddleRec/multi_task/esmm/reader.py delete mode 100644 PaddleRec/multi_task/esmm/utils.py delete mode 100644 PaddleRec/multi_task/mmoe/args.py delete mode 100644 PaddleRec/multi_task/mmoe/create_data.sh delete mode 100644 PaddleRec/multi_task/mmoe/data_preparation.py delete mode 100644 PaddleRec/multi_task/mmoe/mmoe_train.py delete mode 100644 PaddleRec/multi_task/mmoe/requirements.txt delete mode 100644 PaddleRec/multi_task/mmoe/train_cpu.sh delete mode 100644 PaddleRec/multi_task/mmoe/train_gpu.sh delete mode 100644 PaddleRec/multi_task/mmoe/utils.py delete mode 100644 PaddleRec/multi_task/share_bottom/args.py delete mode 100644 PaddleRec/multi_task/share_bottom/create_data.sh delete mode 100644 PaddleRec/multi_task/share_bottom/data_preparation.py delete mode 100644 PaddleRec/multi_task/share_bottom/requirements.txt delete mode 100644 PaddleRec/multi_task/share_bottom/share_bottom.py delete mode 100644 PaddleRec/multi_task/share_bottom/train_cpu.sh delete mode 100644 
PaddleRec/multi_task/share_bottom/train_gpu.sh delete mode 100644 PaddleRec/multi_task/share_bottom/utils.py delete mode 100644 PaddleRec/multiview_simnet/README.cn.md delete mode 100644 PaddleRec/multiview_simnet/__init__.py delete mode 100644 PaddleRec/multiview_simnet/_ce.py delete mode 100644 PaddleRec/multiview_simnet/nets.py delete mode 100644 PaddleRec/multiview_simnet/reader.py delete mode 100644 PaddleRec/ncf/Dataset.py delete mode 100644 PaddleRec/ncf/args.py delete mode 100644 PaddleRec/ncf/create_data.sh delete mode 100644 PaddleRec/ncf/evaluate.py delete mode 100644 PaddleRec/ncf/get_train_data.py delete mode 100644 PaddleRec/ncf/gmf.py delete mode 100644 PaddleRec/ncf/mlp.py delete mode 100644 PaddleRec/ncf/neumf.py delete mode 100644 PaddleRec/ncf/requirements.txt delete mode 100644 PaddleRec/ncf/train_cpu.sh delete mode 100644 PaddleRec/ncf/train_gpu.sh delete mode 100644 PaddleRec/ncf/utils.py delete mode 100644 PaddleRec/rerank/listwise/args.py delete mode 100644 PaddleRec/rerank/listwise/evaluator.py delete mode 100644 PaddleRec/rerank/listwise/infer_cpu.sh delete mode 100644 PaddleRec/rerank/listwise/infer_gpu.sh delete mode 100644 PaddleRec/rerank/listwise/train_cpu.sh delete mode 100644 PaddleRec/rerank/listwise/train_gpu.sh delete mode 100644 PaddleRec/rerank/listwise/utils.py delete mode 100644 PaddleRec/ssr/__init__.py delete mode 100644 PaddleRec/ssr/_ce.py delete mode 100644 PaddleRec/ssr/infer.py delete mode 100644 PaddleRec/ssr/nets.py delete mode 100644 PaddleRec/ssr/reader.py delete mode 100644 PaddleRec/ssr/test_data/small_test.txt delete mode 100644 PaddleRec/ssr/train.py delete mode 100644 PaddleRec/ssr/train_data/small_train.txt delete mode 100644 PaddleRec/ssr/utils.py delete mode 100644 PaddleRec/ssr/vocab.txt delete mode 100644 PaddleRec/tagspace/__init.py__ delete mode 100644 PaddleRec/tagspace/_ce.py delete mode 100644 PaddleRec/tagspace/net.py delete mode 100644 PaddleRec/tagspace/test_data/small_test.csv delete mode 100644 PaddleRec/tagspace/text2paddle.py delete mode 100644 PaddleRec/tagspace/train_data/small_train.csv delete mode 100644 PaddleRec/tagspace/utils.py delete mode 100644 PaddleRec/tagspace/vocab_tag.txt delete mode 100644 PaddleRec/tagspace/vocab_text.txt delete mode 100644 PaddleRec/tdm/tdm_demo/README.md delete mode 100644 PaddleRec/tdm/tdm_demo/args.py delete mode 100644 PaddleRec/tdm/tdm_demo/async_train.sh delete mode 100644 PaddleRec/tdm/tdm_demo/data/test/demo_fake_test_data.txt delete mode 100644 PaddleRec/tdm/tdm_demo/data/train/demo_fake_train_data.txt delete mode 100644 PaddleRec/tdm/tdm_demo/img/demo_network.png delete mode 100644 PaddleRec/tdm/tdm_demo/img/demo_tree.png delete mode 100644 PaddleRec/tdm/tdm_demo/img/dnn-net.png delete mode 100644 PaddleRec/tdm/tdm_demo/img/input-net.png delete mode 100644 PaddleRec/tdm/tdm_demo/infer_network.py delete mode 100644 PaddleRec/tdm/tdm_demo/local_cluster.sh delete mode 100644 PaddleRec/tdm/tdm_demo/run_infer.sh delete mode 100644 PaddleRec/tdm/tdm_demo/run_predict.sh delete mode 100644 PaddleRec/tdm/tdm_demo/run_train.sh delete mode 100644 PaddleRec/tdm/tdm_demo/thirdparty/layer_list.txt delete mode 100644 PaddleRec/tdm/tdm_demo/thirdparty/travel_list.txt delete mode 100644 PaddleRec/tdm/tdm_demo/thirdparty/tree_emb.txt delete mode 100644 PaddleRec/tdm/tdm_demo/thirdparty/tree_info.txt delete mode 100644 PaddleRec/tdm/tdm_demo/train_network.py delete mode 100644 PaddleRec/tdm/tdm_demo/utils.py delete mode 100644 PaddleRec/text_classification/net.py delete mode 100644 
PaddleRec/text_classification/train.py delete mode 100644 PaddleRec/word2vec/net.py delete mode 100644 PaddleRec/word2vec/preprocess.py delete mode 100644 PaddleRec/word2vec/reader.py delete mode 100644 PaddleRec/word2vec/utils.py delete mode 100644 PaddleRec/youbube_dnn/args.py delete mode 100644 PaddleRec/youbube_dnn/get_topk.py delete mode 100644 PaddleRec/youbube_dnn/infer_cpu.sh delete mode 100644 PaddleRec/youbube_dnn/infer_gpu.sh delete mode 100644 PaddleRec/youbube_dnn/rec_topk.sh delete mode 100644 PaddleRec/youbube_dnn/train_cpu.sh delete mode 100644 PaddleRec/youbube_dnn/train_gpu.sh delete mode 100644 PaddleRec/youbube_dnn/youtubednn.py diff --git a/PaddleRec/ctr/README.md b/PaddleRec/ctr/README.md deleted file mode 100644 index 20801819..00000000 --- a/PaddleRec/ctr/README.md +++ /dev/null @@ -1,7 +0,0 @@ - -# Click-Through Rate prediction - -## 简介 -我们提供了常见的ctr任务中使用的模型,包括[dnn](https://github.com/PaddlePaddle/models/tree/develop/PaddleRec/ctr/dnn)、[deepfm](https://github.com/PaddlePaddle/models/tree/develop/PaddleRec/ctr/deepfm)、[xdeepfm](https://github.com/PaddlePaddle/models/tree/develop/PaddleRec/ctr/xdeepfm)和[dcn](https://github.com/PaddlePaddle/models/tree/develop/PaddleRec/ctr/dcn)。 - -同时推荐用户参考[ IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/124378) diff --git a/PaddleRec/ctr/dcn/README.md b/PaddleRec/ctr/dcn/README.md index 1ac51ddc..44a9f26c 100644 --- a/PaddleRec/ctr/dcn/README.md +++ b/PaddleRec/ctr/dcn/README.md @@ -1,109 +1,7 @@ # Deep & Cross Network -以下是本例的简要目录结构及说明: +models/PaddleRec只是提供了经典推荐算法的Paddle实现,我们已经开源了功能更强大的工具组件[PaddlePaddle/PaddleRec](https://github.com/PaddlePaddle/PaddleRec) 打通了推荐算法+分布式训练全流程,并提供了高级API,在单机和分布式间可以实现无缝切换。后续我们将在[PaddlePaddle/PaddleRec](https://github.com/PaddlePaddle/PaddleRec) Repo中发布新的模型和功能,models/PaddleRec不再更新维护。 -```text -. 
-├── README.md # 文档 -├── local_train.py # 本地训练脚本 -├── infer.py # 预测脚本 -├── network.py # 网络结构 -├── config.py # 参数配置 -├── reader.py # 读取数据相关的函数 -├── utils.py # 通用函数 -├── data/ - ├── download.sh # 下载数据脚本 - ├── preprocess.py # 数据预处理脚本 -├── dist_data/ - ├── dist_data_download.sh # 下载单机模拟多机小样本数据脚本 - ├── preprocess_dist.py # 小样本数据预处理脚本 - -``` ## 介绍 DCN模型介绍可以参阅论文[Deep & Cross Network for Ad Click Predictions](https://arxiv.org/abs/1708.05123) - -## 环境 -- **目前模型库下模型均要求使用PaddlePaddle 1.6及以上版本或适当的develop版本** - -## 数据下载 - -我们在Criteo数据集训练测试DCN。整个数据集包含约4500万条记录。每一行第一列是label,表示该条广告是否被点击,剩下的是13个整数型特征(I1 - I13)和26个离散型特征(C1 - C26)。 - -数据下载命令 -```bash -cd data && python download.py -``` - -## 数据处理 - -- 根据论文,使用前6天的数据进行训练(大约41million),第7天的数据一半做valid一半做test。基本上是将数据集按照9:0.5:0.5切分,需要注意的是train数据是前90%。而如xdeepfm等论文实验中8:1:1,并且是完全打乱的。 -- 论文对整数型特征数据使用了log transform,因为只有I2最小值为-3,其余最小值为0,所以对I2采用log(4 + l2_value)对其余采用log(1 + l*_value)。 -- 统计每个离散型特征(即C1 - C26)出现的不同feature id,存在大量的低频feature id。所以需要对低频feature id进行过滤,缩小embedding matrix大小。代码默认设置的频率是10,去掉了大量低频feature id。 - -数据预处理命令 -```bash -python preprocess.py -``` - -数据预处理后,训练数据在train中,验证和测试数据在test_valid中,vocab存储离散型特征过滤低频后的feature id。并统计了整数型特征的最小/最大值,离散型特征的feature id数量。 - -## 本地训练 - -```bash -nohup python -u local_train.py > train.log & -``` -训练过程中每隔固定的steps(默认为100)输出当前total loss(logloss + 正则), log loss和auc,可以在args.py中调整print_steps。 - -## 本地预测 -```bash -nohup python -u infer.py --test_epoch 2 > test.log & -``` -注意:最后一行的auc是整个预测数据集的auc - -## 结果 -本结果在Linux CPU机器上使用dataset开启20线程训练,batch size为512。经过150000 steps(~1.87 epoch)后,预测实验结果如下: -```text -loss: [0.44703564] auc_val: [0.80654419] -``` - -## 多机训练 -首先使用命令下载并预处理小规模样例数据集: -```bash -cd dist_data && python dist_download.py && cd .. -``` -运行命令本地模拟多机场景,默认使用2 X 2,即2个pserver,2个trainer的方式组网训练。 - -**注意:在多机训练中,建议使用Paddle 1.6版本以上或[最新版本](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/install/Tables.html#whl-dev)。** - -```bash -# 该sh不支持Windows -sh cluster_train.sh -``` -参数说明: -- train_data_dir: 训练数据目录 -- model_output_dir: 模型保存目录 -- is_local: 是否单机本地训练(单机模拟多机分布式训练是为0) -- is_sparse: embedding是否使用sparse。如果没有设置,默认是False -- role: 进程角色(pserver或trainer) -- endpoints: 所有pserver地址和端口 -- current_endpoint: 当前pserver(role是pserver)端口和地址 -- trainers: trainer数量 - -其他参数见cluster_train.py - -预测 -```bash -python infer.py --model_output_dir cluster_model --test_epoch 10 --test_valid_data_dir dist_data/dist_test_valid_data --vocab_dir dist_data/vocab --cat_feat_num dist_data/cat_feature_num.txt -``` -注意: - -- 本地模拟需要关闭代理,e.g. 
unset http_proxy, unset https_proxy - -- 0号trainer保存模型参数 - -- 每次训练完成后需要手动停止pserver进程,使用以下命令查看pserver进程: - ->ps -ef | grep python - -- 数据读取使用dataset模式,目前仅支持运行在Linux环境下 diff --git a/PaddleRec/ctr/dcn/config.py b/PaddleRec/ctr/dcn/config.py deleted file mode 100644 index 8dac8ef3..00000000 --- a/PaddleRec/ctr/dcn/config.py +++ /dev/null @@ -1,85 +0,0 @@ -import argparse -""" -global params -""" - - -def boolean_string(s): - if s.lower() not in {'false', 'true'}: - raise ValueError('Not a valid boolean string') - return s.lower() == 'true' - - -def parse_args(): - parser = argparse.ArgumentParser(description="PaddleFluid DCN demo") - parser.add_argument( - '--train_data_dir', - type=str, - default='data/train', - help='The path of train data') - parser.add_argument( - '--test_valid_data_dir', - type=str, - default='data/test_valid', - help='The path of test and valid data') - parser.add_argument( - '--vocab_dir', - type=str, - default='data/vocab', - help='The path of generated vocabs') - parser.add_argument( - '--cat_feat_num', - type=str, - default='data/cat_feature_num.txt', - help='The path of generated cat_feature_num.txt') - parser.add_argument( - '--batch_size', type=int, default=512, help="Batch size") - parser.add_argument( - '--steps', - type=int, - default=150000, - help="Early stop steps in training. If set, num_epoch will not work") - parser.add_argument('--num_epoch', type=int, default=2, help="train epoch") - parser.add_argument( - '--model_output_dir', - type=str, - default='models', - help='The path for model to store') - parser.add_argument( - '--num_thread', type=int, default=20, help='The number of threads') - parser.add_argument('--test_epoch', type=str, default='1') - parser.add_argument( - '--dnn_hidden_units', - nargs='+', - type=int, - default=[1024, 1024], - help='DNN layers and hidden units') - parser.add_argument( - '--cross_num', - type=int, - default=6, - help='The number of Cross network layers') - parser.add_argument('--lr', type=float, default=1e-4, help='Learning rate') - parser.add_argument( - '--l2_reg_cross', - type=float, - default=1e-5, - help='Cross net l2 regularizer coefficient') - parser.add_argument( - '--use_bn', - type=boolean_string, - default=True, - help='Whether use batch norm in dnn part') - parser.add_argument( - '--is_sparse', - action='store_true', - required=False, - default=False, - help='embedding will use sparse or not, (default: False)') - parser.add_argument( - '--clip_by_norm', type=float, default=100.0, help="gradient clip norm") - parser.add_argument('--print_steps', type=int, default=100) - parser.add_argument( - '--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.') - - return parser.parse_args() diff --git a/PaddleRec/ctr/dcn/data/download.py b/PaddleRec/ctr/dcn/data/download.py deleted file mode 100644 index b2fedfe8..00000000 --- a/PaddleRec/ctr/dcn/data/download.py +++ /dev/null @@ -1,24 +0,0 @@ -import os -import sys -import io - -LOCAL_PATH = os.path.dirname(os.path.abspath(__file__)) -TOOLS_PATH = os.path.join(LOCAL_PATH, "..", "..", "tools") -sys.path.append(TOOLS_PATH) - -from tools import download_file_and_uncompress - -if __name__ == '__main__': - trainfile = 'train.txt' - url = "https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz" - - print("download and extract starting...") - download_file_and_uncompress(url) - print("download and extract finished") - - count = 0 - for _ in io.open(trainfile, 'r', encoding='utf-8'): - count += 1 - - 
print("total records: %d" % count) - print("done") diff --git a/PaddleRec/ctr/dcn/data/preprocess.py b/PaddleRec/ctr/dcn/data/preprocess.py deleted file mode 100644 index dd23c7dd..00000000 --- a/PaddleRec/ctr/dcn/data/preprocess.py +++ /dev/null @@ -1,131 +0,0 @@ -from __future__ import print_function, absolute_import, division - -import os -import sys -from collections import Counter -import numpy as np -""" -preprocess Criteo train data, generate extra statistic files for model input. -""" -# input filename -FILENAME = 'train.txt' - -# global vars -CAT_FEATURE_NUM = 'cat_feature_num.txt' -INT_FEATURE_MINMAX = 'int_feature_minmax.txt' -VOCAB_DIR = 'vocab' -TRAIN_DIR = 'train' -TEST_VALID_DIR = 'test_valid' -SPLIT_RATIO = 0.9 -FREQ_THR = 10 - -INT_COLUMN_NAMES = ['I' + str(i) for i in range(1, 14)] -CAT_COLUMN_NAMES = ['C' + str(i) for i in range(1, 27)] - - -def check_statfiles(): - """ - check if statistic files of Criteo exists - :return: - """ - statsfiles = [CAT_FEATURE_NUM, INT_FEATURE_MINMAX] + [ - os.path.join(VOCAB_DIR, cat_fn + '.txt') for cat_fn in CAT_COLUMN_NAMES - ] - if all([os.path.exists(fn) for fn in statsfiles]): - return True - return False - - -def create_statfiles(): - """ - create statistic files of Criteo, including: - min/max of interger features - counts of categorical features - vocabs of each categorical features - :return: - """ - int_minmax_list = [[sys.maxsize, -sys.maxsize] - for _ in range(13)] # count integer feature min max - cat_ct_list = [Counter() for _ in range(26)] # count categorical features - for idx, line in enumerate(open(FILENAME)): - spls = line.rstrip('\n').split('\t') - assert len(spls) == 40 - - for i in range(13): - if not spls[1 + i]: continue - int_val = int(spls[1 + i]) - int_minmax_list[i][0] = min(int_minmax_list[i][0], int_val) - int_minmax_list[i][1] = max(int_minmax_list[i][1], int_val) - - for i in range(26): - cat_ct_list[i].update([spls[14 + i]]) - - # save min max of integer features - with open(INT_FEATURE_MINMAX, 'w') as f: - for name, minmax in zip(INT_COLUMN_NAMES, int_minmax_list): - print("{} {} {}".format(name, minmax[0], minmax[1]), file=f) - - # remove '' from all cat_set[i] and filter low freq categorical value - cat_set_list = [set() for i in range(len(cat_ct_list))] - for i, ct in enumerate(cat_ct_list): - if '' in ct: del ct[''] - for key in list(ct.keys()): - if ct[key] >= FREQ_THR: - cat_set_list[i].add(key) - - del cat_ct_list - - # create vocab dir - if not os.path.exists(VOCAB_DIR): - os.makedirs(VOCAB_DIR) - - # write vocab file of categorical features - with open(CAT_FEATURE_NUM, 'w') as cat_feat_count_file: - for name, s in zip(CAT_COLUMN_NAMES, cat_set_list): - print('{} {}'.format(name, len(s)), file=cat_feat_count_file) - - vocabfile = os.path.join(VOCAB_DIR, name + '.txt') - - with open(vocabfile, 'w') as f: - for vocab_val in s: - print(vocab_val, file=f) - - -def split_data(): - """ - split train.txt into train and test_valid files. 
- :return: - """ - if not os.path.exists(TRAIN_DIR): - os.makedirs(TRAIN_DIR) - if not os.path.exists(TEST_VALID_DIR): - os.makedirs(TEST_VALID_DIR) - - fin = open('train.txt', 'r') - data_dir = TRAIN_DIR - fout = open(os.path.join(data_dir, 'part-0'), 'w') - split_idx = int(45840617 * SPLIT_RATIO) - for line_idx, line in enumerate(fin): - if line_idx == split_idx: - fout.close() - data_dir = TEST_VALID_DIR - cur_part_idx = int(line_idx / 200000) - fout = open( - os.path.join(data_dir, 'part-' + str(cur_part_idx)), 'w') - if line_idx % 200000 == 0 and line_idx != 0: - fout.close() - cur_part_idx = int(line_idx / 200000) - fout = open( - os.path.join(data_dir, 'part-' + str(cur_part_idx)), 'w') - fout.write(line) - fout.close() - fin.close() - - -if __name__ == '__main__': - if not check_statfiles(): - print('create statstic files of Criteo...') - create_statfiles() - print('split train.txt...') - split_data() - print('done') diff --git a/PaddleRec/ctr/dcn/dist_data/dist_download.py b/PaddleRec/ctr/dcn/dist_data/dist_download.py deleted file mode 100644 index 662982f6..00000000 --- a/PaddleRec/ctr/dcn/dist_data/dist_download.py +++ /dev/null @@ -1,19 +0,0 @@ -from __future__ import print_function -import os -import sys -LOCAL_PATH = os.path.dirname(os.path.abspath(__file__)) -TOOLS_PATH = os.path.join(LOCAL_PATH, "..", "..", "tools") -sys.path.append(TOOLS_PATH) - -from tools import download_file_and_uncompress - -if __name__ == '__main__': - url = "https://paddlerec.bj.bcebos.com/deepfm%2Fdist_data_demo.tar.gz" - - print("download and extract starting...") - download_file_and_uncompress(url, savename="dist_data_demo.tar.gz") - print("download and extract finished") - - print("preprocessing...") - os.system("python dist_preprocess.py") - print("preprocess done") \ No newline at end of file diff --git a/PaddleRec/ctr/dcn/dist_data/dist_preprocess.py b/PaddleRec/ctr/dcn/dist_data/dist_preprocess.py deleted file mode 100644 index afad881b..00000000 --- a/PaddleRec/ctr/dcn/dist_data/dist_preprocess.py +++ /dev/null @@ -1,122 +0,0 @@ -from __future__ import print_function, absolute_import, division - -import os -import sys -from collections import Counter -import numpy as np -""" -preprocess Criteo train data, generate extra statistic files for model input. 
-""" -# input filename -FILENAME = 'dist_data_demo.txt' - -# global vars -CAT_FEATURE_NUM = 'cat_feature_num.txt' -INT_FEATURE_MINMAX = 'int_feature_minmax.txt' -VOCAB_DIR = 'vocab' -TRAIN_DIR = 'dist_train_data' -TEST_DIR = 'dist_test_valid_data' -TRAIN_FILE = os.path.join(TRAIN_DIR, 'tr') -TEST_FILE = os.path.join(TEST_DIR, 'ev') -SPLIT_RATIO = 0.9 -FREQ_THR = 10 - -INT_COLUMN_NAMES = ['I' + str(i) for i in range(1, 14)] -CAT_COLUMN_NAMES = ['C' + str(i) for i in range(1, 27)] - - -def check_statfiles(): - """ - check if statistic files of Criteo exists - :return: - """ - statsfiles = [CAT_FEATURE_NUM, INT_FEATURE_MINMAX] + [ - os.path.join(VOCAB_DIR, cat_fn + '.txt') for cat_fn in CAT_COLUMN_NAMES - ] - if all([os.path.exists(fn) for fn in statsfiles]): - return True - return False - - -def create_statfiles(): - """ - create statistic files of Criteo, including: - min/max of interger features - counts of categorical features - vocabs of each categorical features - :return: - """ - int_minmax_list = [[sys.maxsize, -sys.maxsize] - for _ in range(13)] # count integer feature min max - cat_ct_list = [Counter() for _ in range(26)] # count categorical features - for idx, line in enumerate(open(FILENAME)): - spls = line.rstrip('\n').split('\t') - assert len(spls) == 40 - - for i in range(13): - if not spls[1 + i]: continue - int_val = int(spls[1 + i]) - int_minmax_list[i][0] = min(int_minmax_list[i][0], int_val) - int_minmax_list[i][1] = max(int_minmax_list[i][1], int_val) - - for i in range(26): - cat_ct_list[i].update([spls[14 + i]]) - - # save min max of integer features - with open(INT_FEATURE_MINMAX, 'w') as f: - for name, minmax in zip(INT_COLUMN_NAMES, int_minmax_list): - print("{} {} {}".format(name, minmax[0], minmax[1]), file=f) - - # remove '' from all cat_set[i] and filter low freq categorical value - cat_set_list = [set() for i in range(len(cat_ct_list))] - for i, ct in enumerate(cat_ct_list): - if '' in ct: del ct[''] - for key in list(ct.keys()): - if ct[key] >= FREQ_THR: - cat_set_list[i].add(key) - - del cat_ct_list - - # create vocab dir - if not os.path.exists(VOCAB_DIR): - os.makedirs(VOCAB_DIR) - - # write vocab file of categorical features - with open(CAT_FEATURE_NUM, 'w') as cat_feat_count_file: - for name, s in zip(CAT_COLUMN_NAMES, cat_set_list): - print('{} {}'.format(name, len(s)), file=cat_feat_count_file) - - vocabfile = os.path.join(VOCAB_DIR, name + '.txt') - - with open(vocabfile, 'w') as f: - for vocab_val in s: - print(vocab_val, file=f) - - -def split_data(): - """ - split train.txt into train and test_valid files. 
- :return: - """ - if not os.path.exists(TRAIN_DIR): - os.makedirs(TRAIN_DIR) - if not os.path.exists(TEST_DIR): - os.makedirs(TEST_DIR) - - all_lines = [] - for line in open(FILENAME): - all_lines.append(line) - split_line_idx = int(len(all_lines) * SPLIT_RATIO) - with open(TRAIN_FILE, 'w') as f: - f.writelines(all_lines[:split_line_idx]) - with open(TEST_FILE, 'w') as f: - f.writelines(all_lines[split_line_idx:]) - - -if __name__ == '__main__': - if not check_statfiles(): - print('create statstic files of Criteo...') - create_statfiles() - print('split train.txt...') - split_data() - print('done') diff --git a/PaddleRec/ctr/dcn/network.py b/PaddleRec/ctr/dcn/network.py deleted file mode 100644 index 8dd65038..00000000 --- a/PaddleRec/ctr/dcn/network.py +++ /dev/null @@ -1,143 +0,0 @@ -from __future__ import print_function, absolute_import, division -import paddle.fluid as fluid -from collections import OrderedDict -""" -DCN network -""" - - -class DCN(object): - def __init__(self, - cross_num=2, - dnn_hidden_units=(128, 128), - l2_reg_cross=1e-5, - dnn_use_bn=False, - clip_by_norm=None, - cat_feat_dims_dict=None, - is_sparse=False): - self.cross_num = cross_num - self.dnn_hidden_units = dnn_hidden_units - self.l2_reg_cross = l2_reg_cross - self.dnn_use_bn = dnn_use_bn - self.clip_by_norm = clip_by_norm - self.cat_feat_dims_dict = cat_feat_dims_dict if cat_feat_dims_dict else OrderedDict( - ) - self.is_sparse = is_sparse - - self.dense_feat_names = ['I' + str(i) for i in range(1, 14)] - self.sparse_feat_names = ['C' + str(i) for i in range(1, 27)] - target = ['label'] - - # {feat_name: dims} - self.feat_dims_dict = OrderedDict( - [(feat_name, 1) for feat_name in self.dense_feat_names]) - self.feat_dims_dict.update(self.cat_feat_dims_dict) - - self.net_input = None - self.loss = None - - def build_network(self, is_test=False): - # data input - self.target_input = fluid.data( - name='label', shape=[None, 1], dtype='float32') - - data_dict = OrderedDict() - for feat_name in self.feat_dims_dict: - data_dict[feat_name] = fluid.data( - name=feat_name, shape=[None, 1], dtype='float32') - - self.net_input = self._create_embedding_input(data_dict) - - deep_out = self._deep_net(self.net_input, self.dnn_hidden_units, - self.dnn_use_bn, is_test) - cross_out, l2_reg_cross_loss = self._cross_net(self.net_input, - self.cross_num) - last_out = fluid.layers.concat([deep_out, cross_out], axis=-1) - logit = fluid.layers.fc(last_out, 1) - self.prob = fluid.layers.sigmoid(logit) - self.data_list = [self.target_input] + [ - data_dict[dense_name] for dense_name in self.dense_feat_names - ] + [data_dict[sparse_name] for sparse_name in self.sparse_feat_names] - - # auc - prob_2d = fluid.layers.concat([1 - self.prob, self.prob], 1) - label_int = fluid.layers.cast(self.target_input, 'int64') - auc_var, batch_auc_var, self.auc_states = fluid.layers.auc( - input=prob_2d, label=label_int, slide_steps=0) - self.auc_var = auc_var - - # logloss - logloss = fluid.layers.log_loss(self.prob, self.target_input) - self.avg_logloss = fluid.layers.reduce_mean(logloss) - - # reg_coeff * l2_reg_cross - l2_reg_cross_loss = self.l2_reg_cross * l2_reg_cross_loss - self.loss = self.avg_logloss + l2_reg_cross_loss - - def backward(self, lr): - p_g_clip = fluid.backward.append_backward(loss=self.loss) - clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_by_norm) - p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip) - - optimizer = fluid.optimizer.Adam(learning_rate=lr, grad_clip=clip) - # params_grads = 
optimizer.backward(self.loss) - optimizer.apply_gradients(p_g_clip) - - def _deep_net(self, input, hidden_units, use_bn=False, is_test=False): - for units in hidden_units: - input = fluid.layers.fc(input=input, size=units) - if use_bn: - input = fluid.layers.batch_norm(input, is_test=is_test) - input = fluid.layers.relu(input) - return input - - def _cross_layer(self, x0, x, prefix): - input_dim = x0.shape[-1] - w = fluid.layers.create_parameter( - [input_dim], dtype='float32', name=prefix + "_w") - b = fluid.layers.create_parameter( - [input_dim], dtype='float32', name=prefix + "_b") - xw = fluid.layers.reduce_sum(x * w, dim=1, keep_dim=True) # (N, 1) - return x0 * xw + b + x, w - - def _cross_net(self, input, num_corss_layers): - x = x0 = input - l2_reg_cross_list = [] - for i in range(num_corss_layers): - x, w = self._cross_layer(x0, x, "cross_layer_{}".format(i)) - l2_reg_cross_list.append(self._l2_loss(w)) - l2_reg_cross_loss = fluid.layers.reduce_sum( - fluid.layers.concat( - l2_reg_cross_list, axis=-1)) - return x, l2_reg_cross_loss - - def _l2_loss(self, w): - return fluid.layers.reduce_sum(fluid.layers.square(w)) - - def _create_embedding_input(self, data_dict): - # sparse embedding - sparse_emb_dict = OrderedDict((name, fluid.embedding( - input=fluid.layers.cast( - data_dict[name], dtype='int64'), - size=[ - self.feat_dims_dict[name] + 1, - 6 * int(pow(self.feat_dims_dict[name], 0.25)) - ], - is_sparse=self.is_sparse)) for name in self.sparse_feat_names) - - # combine dense and sparse_emb - dense_input_list = [ - data_dict[name] for name in data_dict if name.startswith('I') - ] - sparse_emb_list = list(sparse_emb_dict.values()) - - sparse_input = fluid.layers.concat(sparse_emb_list, axis=-1) - sparse_input = fluid.layers.flatten(sparse_input) - - dense_input = fluid.layers.concat(dense_input_list, axis=-1) - dense_input = fluid.layers.flatten(dense_input) - dense_input = fluid.layers.cast(dense_input, 'float32') - - net_input = fluid.layers.concat([dense_input, sparse_input], axis=-1) - - return net_input diff --git a/PaddleRec/ctr/dcn/reader.py b/PaddleRec/ctr/dcn/reader.py deleted file mode 100644 index d121f9fd..00000000 --- a/PaddleRec/ctr/dcn/reader.py +++ /dev/null @@ -1,94 +0,0 @@ -""" -dataset and reader -""" -import math -import sys -import paddle.fluid.incubate.data_generator as dg -import pickle -from collections import Counter -import os - - -class CriteoDataset(dg.MultiSlotDataGenerator): - def setup(self, vocab_dir): - self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - self.cont_max_ = [ - 5775, 257675, 65535, 969, 23159456, 431037, 56311, 6047, 29019, 11, - 231, 4008, 7393 - ] - self.cont_diff_ = [ - self.cont_max_[i] - self.cont_min_[i] - for i in range(len(self.cont_min_)) - ] - self.cont_idx_ = list(range(1, 14)) - self.cat_idx_ = list(range(14, 40)) - - dense_feat_names = ['I' + str(i) for i in range(1, 14)] - sparse_feat_names = ['C' + str(i) for i in range(1, 27)] - target = ['label'] - - self.label_feat_names = target + dense_feat_names + sparse_feat_names - - self.cat_feat_idx_dict_list = [{} for _ in range(26)] - for i in range(26): - lookup_idx = 1 # remain 0 for default value - for line in open( - os.path.join(vocab_dir, 'C' + str(i + 1) + '.txt')): - self.cat_feat_idx_dict_list[i][line.strip()] = lookup_idx - lookup_idx += 1 - - def _process_line(self, line): - features = line.rstrip('\n').split('\t') - label_feat_list = [[] for _ in range(40)] - for idx in self.cont_idx_: - if features[idx] == '': - label_feat_list[idx].append(0) - else: - # 
0-1 minmax norm - # label_feat_list[idx].append((float(features[idx]) - self.cont_min_[idx - 1]) / - # self.cont_diff_[idx - 1]) - # log transform - label_feat_list[idx].append( - math.log(4 + float(features[idx])) - if idx == 2 else math.log(1 + float(features[idx]))) - for idx in self.cat_idx_: - if features[idx] == '' or features[ - idx] not in self.cat_feat_idx_dict_list[idx - 14]: - label_feat_list[idx].append(0) - else: - label_feat_list[idx].append(self.cat_feat_idx_dict_list[ - idx - 14][features[idx]]) - label_feat_list[0].append(int(features[0])) - return label_feat_list - - def test_reader(self, filelist, batch, buf_size): - print(filelist) - - def local_iter(): - for fname in filelist: - with open(fname.strip(), 'r') as fin: - for line in fin: - label_feat_list = self._process_line(line) - yield label_feat_list - - import paddle - batch_iter = fluid.io.batch( - fluid.io.buffered( - local_iter, size=buf_size), batch_size=batch) - return batch_iter - - def generate_sample(self, line): - def data_iter(): - label_feat_list = self._process_line(line) - yield list(zip(self.label_feat_names, label_feat_list)) - - return data_iter - - -if __name__ == '__main__': - criteo_dataset = CriteoDataset() - if len(sys.argv) <= 1: - sys.stderr.write("feat_dict needed for criteo reader.") - exit(1) - criteo_dataset.setup(sys.argv[1]) - criteo_dataset.run_from_stdin() diff --git a/PaddleRec/ctr/dcn/utils.py b/PaddleRec/ctr/dcn/utils.py deleted file mode 100644 index 779b129e..00000000 --- a/PaddleRec/ctr/dcn/utils.py +++ /dev/null @@ -1,24 +0,0 @@ -import sys -import paddle.fluid as fluid -import logging - -logging.basicConfig() -logger = logging.getLogger(__name__) - -__all__ = ['check_version'] - - -def check_version(): - """ - Log error and exit when the installed version of paddlepaddle is - not satisfied. - """ - err = "PaddlePaddle version 1.6 or higher is required, " \ - "or a suitable develop version is satisfied as well. \n" \ - "Please make sure the version is good with your code." \ - - try: - fluid.require_version('1.6.0') - except Exception as e: - logger.error(err) - sys.exit(1) diff --git a/PaddleRec/ctr/deepfm/README.md b/PaddleRec/ctr/deepfm/README.md index ace75e54..e54f5005 100644 --- a/PaddleRec/ctr/deepfm/README.md +++ b/PaddleRec/ctr/deepfm/README.md @@ -1,99 +1,3 @@ - # DeepFM for CTR Prediction -## Introduction -This model implementation reproduces the result of the paper "DeepFM: A Factorization-Machine based Neural Network for CTR Prediction" on Criteo dataset. - -```text -@inproceedings{guo2017deepfm, - title={DeepFM: A Factorization-Machine based Neural Network for CTR Prediction}, - author={Huifeng Guo, Ruiming Tang, Yunming Ye, Zhenguo Li and Xiuqiang He}, - booktitle={the Twenty-Sixth International Joint Conference on Artificial Intelligence (IJCAI)}, - pages={1725--1731}, - year={2017} -} -``` - -## Environment -- **Now all models in PaddleRec require PaddlePaddle version 1.6 or higher, or suitable develop version.** - -## Download and preprocess data - -We evaluate the effectiveness of our implemented DeepFM on Criteo dataset. The dataset was used for the [Display Advertising Challenge](https://www.kaggle.com/c/criteo-display-ad-challenge/) hosted by Kaggle and includes 45 million users'click records. Each row is the features for an ad display and the first column is a label indicating whether this ad has been clicked or not. There are 13 continuous features and 26 categorical ones. 
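As a side note to the dataset description above: each Criteo record is one tab-separated line holding the label, the 13 integer features and the 26 categorical features. The sketch below is not part of this patch; the helper name `parse_criteo_line` is invented for illustration and only mirrors the column layout the README describes.

```python
# Minimal sketch of splitting one raw Criteo record; assumes the
# 1 label + 13 integer + 26 categorical column layout described above.
def parse_criteo_line(line):
    fields = line.rstrip('\n').split('\t')
    assert len(fields) == 40, "expects label + I1-I13 + C1-C26"
    label = int(fields[0])
    # integer features may be empty; keep None so a later step can impute them
    int_feats = [int(v) if v != '' else None for v in fields[1:14]]
    cat_feats = fields[14:40]  # hashed category strings, possibly empty
    return label, int_feats, cat_feats
```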
-
-To preprocess the raw dataset, we min-max normalize continuous features to [0, 1] and filter categorical features that occur less than 10 times. The dataset is randomly split into two parts: 90% is for training, while the rest 10% is for testing.
-
-Download and preprocess data:
-```bash
-cd data && python download_preprocess.py && cd ..
-```
-
-After executing these commands, three folders "train_data", "test_data" and "aid_data" will be generated. The folder "train_data" contains 90% of the raw data, while the rest 10% is in "test_data". The folder "aid_data" contains the generated feature dictionary "feat_dict.pkl2".
-
-## Local Train
-
-```bash
-nohup python local_train.py --model_output_dir models >> train_log 2>&1 &
-```
-
-## Local Infer
-```bash
-nohup python infer.py --model_output_dir models --test_epoch 1 >> infer_log 2>&1 &
-```
-Note: The last line of the log reports the total Logloss and AUC over all test data.
-
-## Result
-Reproducing this result requires training with the default hyperparameters, which are listed in `args.py`. With these defaults (10 threads, batch size 100, etc.), it takes about 1.8 hours on CPU to iterate over the training data once.
-
-When the training set is iterated to the 22nd round, the testing Logloss is `0.44797` and the testing AUC is `0.8046`.
-
-[figure: picture/deepfm_result.png, the DeepFM training result]
-
-## Distributed Train
-We emulate distributed training on a local machine. By default we use a 2 X 2 setup, i.e. 2 pservers X 2 trainers.
-
-**Note: we suggest using Paddle >= 1.6 or [the latest Paddle](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/install/Tables.html#whl-dev) for distributed training.**
-
-### Download and preprocess distributed demo dataset
-This small demo dataset (a few lines from the Criteo dataset) is only meant to verify that distributed training runs.
-```bash
-cd dist_data && python dist_data_download.py && cd ..
-```
-
-### Distributed Train and Infer
-Train
-```bash
-# this script does not support Windows
-sh cluster_train.sh
-```
-Params of cluster_train.sh:
-- train_data_dir: path of train data
-- model_output_dir: path of saved model
-- is_local: local or distributed training (set to 0 for distributed training)
-- is_sparse: whether to use sparse updates for the embedding. If not set, default is False.
-- role: role of the process (pserver or trainer)
-- endpoints: ip:port of all pservers
-- current_endpoint: ip:port of the current pserver (role should be pserver)
-- trainers: the number of trainers
-
-Other params are explained in cluster_train.py.
-
-Infer
-```bash
-python infer.py --model_output_dir cluster_model --test_epoch 10 --num_feat 141443 --test_data_dir=dist_data/dist_test_data --feat_dict='dist_data/aid_data/feat_dict_10.pkl2'
-```
-
-Notes:
-- **The proxy must be disabled**, e.g. unset http_proxy, unset https_proxy.
-
-- The first trainer (with trainer_id 0) saves the model params.
-
-- After each training run, pserver processes must be stopped manually. You can find them with the command below:
-
->ps -ef | grep python
-
-- We use the Dataset API to load data, which is currently only supported on Linux.
-
-## Distributed Train with Fleet
-Fleet is a high-level API for distributed training in PaddlePaddle. See the [DeepFM example](https://github.com/PaddlePaddle/Fleet/tree/develop/examples/deepFM) in the Fleet repo.
+models/PaddleRec只是提供了经典推荐算法的Paddle实现,我们已经开源了功能更强大的工具组件[PaddlePaddle/PaddleRec](https://github.com/PaddlePaddle/PaddleRec) 打通了推荐算法+分布式训练全流程,并提供了高级API,在单机和分布式间可以实现无缝切换。后续我们将在[PaddlePaddle/PaddleRec](https://github.com/PaddlePaddle/PaddleRec) Repo中发布新的模型和功能,models/PaddleRec不再更新维护。 diff --git a/PaddleRec/ctr/deepfm/args.py b/PaddleRec/ctr/deepfm/args.py deleted file mode 100644 index d28e64ec..00000000 --- a/PaddleRec/ctr/deepfm/args.py +++ /dev/null @@ -1,73 +0,0 @@ -import argparse - - -def parse_args(): - parser = argparse.ArgumentParser(description="PaddlePaddle CTR example") - parser.add_argument( - '--train_data_dir', - type=str, - default='data/train_data', - help='The path of train data (default: data/train_data)') - parser.add_argument( - '--test_data_dir', - type=str, - default='data/test_data', - help='The path of test data (default: models)') - parser.add_argument( - '--feat_dict', - type=str, - default='data/aid_data/feat_dict_10.pkl2', - help='The path of feat_dict') - parser.add_argument( - '--batch_size', - type=int, - default=100, - help="The size of mini-batch (default:100)") - parser.add_argument( - '--embedding_size', - type=int, - default=10, - help="The size for embedding layer (default:10)") - parser.add_argument( - '--num_epoch', - type=int, - default=30, - help="The number of epochs to train (default: 10)") - parser.add_argument( - '--model_output_dir', - type=str, - required=True, - help='The path for model to store (default: models)') - parser.add_argument( - '--num_thread', - type=int, - default=10, - help='The number of threads (default: 10)') - parser.add_argument('--test_epoch', type=str, default='1') - parser.add_argument( - '--layer_sizes', - nargs='+', - type=int, - default=[400, 400, 400], - help='The size of each layers (default: [400, 400, 400])') - parser.add_argument( - '--act', - type=str, - default='relu', - help='The activation of each layers (default: relu)') - parser.add_argument( - '--is_sparse', - action='store_true', - required=False, - default=False, - help='embedding will use sparse or not, (default: False)') - parser.add_argument( - '--lr', type=float, default=1e-4, help='Learning rate (default: 1e-4)') - parser.add_argument( - '--reg', type=float, default=1e-4, help=' (default: 1e-4)') - parser.add_argument('--num_field', type=int, default=39) - parser.add_argument('--num_feat', type=int, default=1086460) # 2090493 - parser.add_argument( - '--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.') - - return parser.parse_args() diff --git a/PaddleRec/ctr/deepfm/data/aid_data/train_file_idx.txt b/PaddleRec/ctr/deepfm/data/aid_data/train_file_idx.txt deleted file mode 100644 index dc9ab9bd..00000000 --- a/PaddleRec/ctr/deepfm/data/aid_data/train_file_idx.txt +++ /dev/null @@ -1 +0,0 @@ -[143, 174, 214, 126, 27, 100, 74, 15, 83, 167, 87, 13, 90, 107, 1, 123, 76, 59, 44, 22, 203, 75, 216, 169, 101, 229, 63, 183, 112, 140, 91, 14, 115, 211, 227, 171, 51, 173, 137, 194, 223, 159, 168, 182, 208, 215, 7, 41, 120, 16, 77, 0, 220, 109, 166, 156, 29, 26, 95, 102, 196, 151, 98, 42, 163, 40, 114, 199, 35, 225, 179, 17, 62, 86, 149, 180, 133, 54, 170, 55, 68, 8, 99, 135, 181, 46, 134, 118, 201, 148, 210, 79, 25, 116, 38, 158, 141, 81, 37, 49, 39, 61, 34, 9, 150, 121, 65, 185, 213, 3, 11, 190, 20, 157, 108, 47, 24, 198, 104, 222, 127, 50, 4, 202, 142, 218, 48, 186, 32, 130, 85, 191, 53, 221, 224, 128, 33, 165, 172, 110, 69, 72, 152, 19, 88, 18, 119, 117, 111, 66, 177, 92, 106, 228, 212, 89, 195, 21, 113, 58, 43, 164, 
138, 23, 70, 73, 178, 5, 122, 139, 97, 161, 162, 30, 136, 155, 93, 132, 52, 105, 80, 36, 10, 204, 45, 192, 125, 219, 209, 129, 124, 67, 176, 205, 154, 31, 60, 153, 146, 207, 56, 6, 71, 82, 217, 84, 226] \ No newline at end of file diff --git a/PaddleRec/ctr/deepfm/data/download_preprocess.py b/PaddleRec/ctr/deepfm/data/download_preprocess.py deleted file mode 100644 index 05461023..00000000 --- a/PaddleRec/ctr/deepfm/data/download_preprocess.py +++ /dev/null @@ -1,25 +0,0 @@ -import os -import shutil -import sys - -LOCAL_PATH = os.path.dirname(os.path.abspath(__file__)) -TOOLS_PATH = os.path.join(LOCAL_PATH, "..", "..", "tools") -sys.path.append(TOOLS_PATH) - -from tools import download_file_and_uncompress, download_file - -if __name__ == '__main__': - url = "https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz" - url2 = "https://paddlerec.bj.bcebos.com/deepfm%2Ffeat_dict_10.pkl2" - - print("download and extract starting...") - download_file_and_uncompress(url) - download_file(url2, "./aid_data/feat_dict_10.pkl2", True) - print("download and extract finished") - - print("preprocessing...") - os.system("python preprocess.py") - print("preprocess done") - - shutil.rmtree("raw_data") - print("done") diff --git a/PaddleRec/ctr/deepfm/data/preprocess.py b/PaddleRec/ctr/deepfm/data/preprocess.py deleted file mode 100644 index 1fa4a5fe..00000000 --- a/PaddleRec/ctr/deepfm/data/preprocess.py +++ /dev/null @@ -1,101 +0,0 @@ -import os -import numpy -from collections import Counter -import shutil -import pickle - - -def get_raw_data(): - if not os.path.isdir('raw_data'): - os.mkdir('raw_data') - - fin = open('train.txt', 'r') - fout = open('raw_data/part-0', 'w') - for line_idx, line in enumerate(fin): - if line_idx % 200000 == 0 and line_idx != 0: - fout.close() - cur_part_idx = int(line_idx / 200000) - fout = open('raw_data/part-' + str(cur_part_idx), 'w') - fout.write(line) - fout.close() - fin.close() - - -def split_data(): - split_rate_ = 0.9 - dir_train_file_idx_ = 'aid_data/train_file_idx.txt' - filelist_ = [ - 'raw_data/part-%d' % x for x in range(len(os.listdir('raw_data'))) - ] - - if not os.path.exists(dir_train_file_idx_): - train_file_idx = list( - numpy.random.choice( - len(filelist_), int(len(filelist_) * split_rate_), False)) - with open(dir_train_file_idx_, 'w') as fout: - fout.write(str(train_file_idx)) - else: - with open(dir_train_file_idx_, 'r') as fin: - train_file_idx = eval(fin.read()) - - for idx in range(len(filelist_)): - if idx in train_file_idx: - shutil.move(filelist_[idx], 'train_data') - else: - shutil.move(filelist_[idx], 'test_data') - - -def get_feat_dict(): - freq_ = 10 - dir_feat_dict_ = 'aid_data/feat_dict_' + str(freq_) + '.pkl2' - continuous_range_ = range(1, 14) - categorical_range_ = range(14, 40) - - if not os.path.exists(dir_feat_dict_): - # print('generate a feature dict') - # Count the number of occurrences of discrete features - feat_cnt = Counter() - with open('train.txt', 'r') as fin: - for line_idx, line in enumerate(fin): - if line_idx % 100000 == 0: - print('generating feature dict', line_idx / 45000000) - features = line.rstrip('\n').split('\t') - for idx in categorical_range_: - if features[idx] == '': continue - feat_cnt.update([features[idx]]) - - # Only retain discrete features with high frequency - dis_feat_set = set() - for feat, ot in feat_cnt.items(): - if ot >= freq_: - dis_feat_set.add(feat) - - # Create a dictionary for continuous and discrete features - feat_dict = {} - tc = 1 - # Continuous 
features - for idx in continuous_range_: - feat_dict[idx] = tc - tc += 1 - for feat in dis_feat_set: - feat_dict[feat] = tc - tc += 1 - # Save dictionary - with open(dir_feat_dict_, 'wb') as fout: - pickle.dump(feat_dict, fout, protocol=2) - print('args.num_feat ', len(feat_dict) + 1) - - -if __name__ == '__main__': - if not os.path.isdir('train_data'): - os.mkdir('train_data') - if not os.path.isdir('test_data'): - os.mkdir('test_data') - if not os.path.isdir('aid_data'): - os.mkdir('aid_data') - - get_raw_data() - split_data() - get_feat_dict() - - print('Done!') diff --git a/PaddleRec/ctr/deepfm/dist_data/dist_data_download.py b/PaddleRec/ctr/deepfm/dist_data/dist_data_download.py deleted file mode 100644 index 63e2756d..00000000 --- a/PaddleRec/ctr/deepfm/dist_data/dist_data_download.py +++ /dev/null @@ -1,22 +0,0 @@ -import os -import shutil -import sys - -LOCAL_PATH = os.path.dirname(os.path.abspath(__file__)) -TOOLS_PATH = os.path.join(LOCAL_PATH, "..", "..", "tools") -sys.path.append(TOOLS_PATH) - -from tools import download_file_and_uncompress - -if __name__ == '__main__': - url = "https://paddlerec.bj.bcebos.com/deepfm%2Fdist_data_demo.tar.gz" - - print("download and extract starting...") - download_file_and_uncompress(url, savename="dist_data_demo.tar.gz") - print("download and extract finished") - - print("preprocessing...") - os.system("python preprocess_dist.py") - print("preprocess done") - - print("done") \ No newline at end of file diff --git a/PaddleRec/ctr/deepfm/dist_data/preprocess_dist.py b/PaddleRec/ctr/deepfm/dist_data/preprocess_dist.py deleted file mode 100644 index fd9739e0..00000000 --- a/PaddleRec/ctr/deepfm/dist_data/preprocess_dist.py +++ /dev/null @@ -1,75 +0,0 @@ -import os -import numpy -from collections import Counter -import shutil -import pickle - -SPLIT_RATIO = 0.9 -INPUT_FILE = 'dist_data_demo.txt' -TRAIN_FILE = os.path.join('dist_train_data', 'tr') -TEST_FILE = os.path.join('dist_test_data', 'ev') - - -def split_data(): - all_lines = [] - for line in open(INPUT_FILE): - all_lines.append(line) - split_line_idx = int(len(all_lines) * SPLIT_RATIO) - with open(TRAIN_FILE, 'w') as f: - f.writelines(all_lines[:split_line_idx]) - with open(TEST_FILE, 'w') as f: - f.writelines(all_lines[split_line_idx:]) - - -def get_feat_dict(): - freq_ = 10 - dir_feat_dict_ = 'aid_data/feat_dict_' + str(freq_) + '.pkl2' - continuous_range_ = range(1, 14) - categorical_range_ = range(14, 40) - - if not os.path.exists(dir_feat_dict_): - # print('generate a feature dict') - # Count the number of occurrences of discrete features - feat_cnt = Counter() - with open(INPUT_FILE, 'r') as fin: - for line_idx, line in enumerate(fin): - features = line.rstrip('\n').split('\t') - for idx in categorical_range_: - if features[idx] == '': continue - feat_cnt.update([features[idx]]) - - # Only retain discrete features with high frequency - # not filter low freq in small dataset - freq_ = 0 - feat_set = set() - for feat, ot in feat_cnt.items(): - if ot >= freq_: - feat_set.add(feat) - - # Create a dictionary for continuous and discrete features - feat_dict = {} - tc = 1 - # Continuous features - for idx in continuous_range_: - feat_dict[idx] = tc - tc += 1 - for feat in feat_set: - feat_dict[feat] = tc - tc += 1 - with open(dir_feat_dict_, 'wb') as fout: - pickle.dump(feat_dict, fout, protocol=2) - print('args.num_feat ', len(feat_dict) + 1) - - -if __name__ == '__main__': - if not os.path.isdir('dist_train_data'): - os.mkdir('dist_train_data') - if not os.path.isdir('dist_test_data'): - 
os.mkdir('dist_test_data') - if not os.path.isdir('aid_data'): - os.mkdir('aid_data') - - split_data() - get_feat_dict() - - print('Done!') diff --git a/PaddleRec/ctr/deepfm/network_conf.py b/PaddleRec/ctr/deepfm/network_conf.py deleted file mode 100644 index 609ea12f..00000000 --- a/PaddleRec/ctr/deepfm/network_conf.py +++ /dev/null @@ -1,115 +0,0 @@ -import paddle.fluid as fluid -import math - - -def ctr_deepfm_model(embedding_size, - num_field, - num_feat, - layer_sizes, - act, - reg, - is_sparse=False): - init_value_ = 0.1 - - raw_feat_idx = fluid.data( - name='feat_idx', shape=[None, num_field], dtype='int64') - raw_feat_value = fluid.data( - name='feat_value', shape=[None, num_field], dtype='float32') - label = fluid.data( - name='label', shape=[None, 1], dtype='float32') # None * 1 - - feat_idx = fluid.layers.reshape(raw_feat_idx, - [-1, 1]) # (None * num_field) * 1 - feat_value = fluid.layers.reshape( - raw_feat_value, [-1, num_field, 1]) # None * num_field * 1 - - # -------------------- first order term -------------------- - - first_weights_re = fluid.embedding( - input=feat_idx, - is_sparse=is_sparse, - dtype='float32', - size=[num_feat + 1, 1], - padding_idx=0, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.TruncatedNormalInitializer( - loc=0.0, scale=init_value_), - regularizer=fluid.regularizer.L1DecayRegularizer(reg))) - first_weights = fluid.layers.reshape( - first_weights_re, shape=[-1, num_field, 1]) # None * num_field * 1 - y_first_order = fluid.layers.reduce_sum((first_weights * feat_value), 1) - - # -------------------- second order term -------------------- - - feat_embeddings_re = fluid.embedding( - input=feat_idx, - is_sparse=is_sparse, - dtype='float32', - size=[num_feat + 1, embedding_size], - padding_idx=0, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.TruncatedNormalInitializer( - loc=0.0, scale=init_value_ / math.sqrt(float(embedding_size))))) - feat_embeddings = fluid.layers.reshape( - feat_embeddings_re, - shape=[-1, num_field, - embedding_size]) # None * num_field * embedding_size - feat_embeddings = feat_embeddings * feat_value # None * num_field * embedding_size - - # sum_square part - summed_features_emb = fluid.layers.reduce_sum(feat_embeddings, - 1) # None * embedding_size - summed_features_emb_square = fluid.layers.square( - summed_features_emb) # None * embedding_size - - # square_sum part - squared_features_emb = fluid.layers.square( - feat_embeddings) # None * num_field * embedding_size - squared_sum_features_emb = fluid.layers.reduce_sum( - squared_features_emb, 1) # None * embedding_size - - y_second_order = 0.5 * fluid.layers.reduce_sum( - summed_features_emb_square - squared_sum_features_emb, 1, - keep_dim=True) # None * 1 - - # -------------------- DNN -------------------- - - y_dnn = fluid.layers.reshape(feat_embeddings, - [-1, num_field * embedding_size]) - for s in layer_sizes: - y_dnn = fluid.layers.fc( - input=y_dnn, - size=s, - act=act, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.TruncatedNormalInitializer( - loc=0.0, scale=init_value_ / math.sqrt(float(10)))), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.TruncatedNormalInitializer( - loc=0.0, scale=init_value_))) - y_dnn = fluid.layers.fc( - input=y_dnn, - size=1, - act=None, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.TruncatedNormalInitializer( - loc=0.0, scale=init_value_)), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.TruncatedNormalInitializer( - loc=0.0, scale=init_value_))) - - # 
------------------- DeepFM ------------------ - - predict = fluid.layers.sigmoid(y_first_order + y_second_order + y_dnn) - cost = fluid.layers.log_loss(input=predict, label=label) - batch_cost = fluid.layers.reduce_sum(cost) - - # for auc - predict_2d = fluid.layers.concat([1 - predict, predict], 1) - label_int = fluid.layers.cast(label, 'int64') - auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict_2d, - label=label_int, - slide_steps=0) - - return batch_cost, auc_var, [raw_feat_idx, raw_feat_value, - label], auc_states diff --git a/PaddleRec/ctr/deepfm/picture/deepfm_result.png b/PaddleRec/ctr/deepfm/picture/deepfm_result.png deleted file mode 100644 index 967bb24a8f495af28ba3a963ff5d7c3ce96cdd0d..0000000000000000000000000000000000000000 GIT binary patch
= freq_: - dis_feat_set.add(feat) - - # Create a dictionary for continuous and discrete features - feat_dict = {} - tc = 1 - # Continuous features - for idx in continuous_range_: - feat_dict[idx] = tc - tc += 1 - for feat in dis_feat_set: - feat_dict[feat] = tc - tc += 1 - # Save dictionary - with open(dir_feat_dict_, 'wb') as fout: - pickle.dump(feat_dict, fout, protocol=2) - print('args.num_feat ', len(feat_dict) + 1) - - -def preprocess(input_file, - outdir, - ins_per_file, - total_ins=None, - print_freq=None): - train_data = os.path.join(outdir, "train_data") - test_data = os.path.join(outdir, "test_data") - aid_data = os.path.join(outdir, "aid_data") - raw_data = os.path.join(outdir, "raw_data") - if not os.path.isdir(train_data): - os.mkdir(train_data) - if not os.path.isdir(test_data): - os.mkdir(test_data) - if not os.path.isdir(aid_data): - os.mkdir(aid_data) - - if print_freq is None: - print_freq = 10 * ins_per_file - - get_raw_data(input_file, raw_data, ins_per_file) - split_data(raw_data, aid_data, train_data, test_data) - 
get_feat_dict(input_file, aid_data, print_freq, total_ins) - - print('Done!') - - -if __name__ == '__main__': - preprocess('train.txt', './', 200000, 45000000) diff --git a/PaddleRec/ctr/deepfm_dygraph/network.py b/PaddleRec/ctr/deepfm_dygraph/network.py index e954d1b8..1d4b7e02 100644 --- a/PaddleRec/ctr/deepfm_dygraph/network.py +++ b/PaddleRec/ctr/deepfm_dygraph/network.py @@ -1,10 +1,9 @@ import math +import paddle -import paddle.fluid as fluid -from paddle.fluid.dygraph.nn import Linear, Embedding -class DeepFM(fluid.dygraph.Layer): +class DeepFM(paddle.nn.Layer): def __init__(self, args): super(DeepFM, self).__init__() self.args = args @@ -14,9 +13,9 @@ class DeepFM(fluid.dygraph.Layer): self.dnn = DNN(args) def forward(self, raw_feat_idx, raw_feat_value, label): - feat_idx = fluid.layers.reshape(raw_feat_idx, + feat_idx = paddle.fluid.layers.reshape(raw_feat_idx, [-1, 1]) # (None * num_field) * 1 - feat_value = fluid.layers.reshape( + feat_value = paddle.fluid.layers.reshape( raw_feat_value, [-1, self.args.num_field, 1]) # None * num_field * 1 @@ -24,31 +23,31 @@ class DeepFM(fluid.dygraph.Layer): feat_value) y_dnn = self.dnn(feat_embeddings) - predict = fluid.layers.sigmoid(y_first_order + y_second_order + y_dnn) + predict = paddle.nn.functional.sigmoid(y_first_order + y_second_order + y_dnn) return predict -class FM(fluid.dygraph.Layer): +class FM(paddle.nn.Layer): def __init__(self, args): super(FM, self).__init__() self.args = args self.init_value_ = 0.1 - self.embedding_w = Embedding( + self.embedding_w = paddle.fluid.dygraph.nn.Embedding( size=[self.args.num_feat + 1, 1], dtype='float32', padding_idx=0, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.TruncatedNormalInitializer( + param_attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.TruncatedNormal( loc=0.0, scale=self.init_value_), - regularizer=fluid.regularizer.L1DecayRegularizer( + regularizer=paddle.fluid.regularizer.L1DecayRegularizer( self.args.reg))) - self.embedding = Embedding( + self.embedding = paddle.fluid.dygraph.nn.Embedding( size=[self.args.num_feat + 1, self.args.embedding_size], dtype='float32', padding_idx=0, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.TruncatedNormalInitializer( + param_attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.TruncatedNormal( loc=0.0, scale=self.init_value_ / math.sqrt(float(self.args.embedding_size))))) @@ -56,32 +55,32 @@ class FM(fluid.dygraph.Layer): def forward(self, feat_idx, feat_value): # -------------------- first order term -------------------- first_weights_re = self.embedding_w(feat_idx) - first_weights = fluid.layers.reshape( + first_weights = paddle.fluid.layers.reshape( first_weights_re, shape=[-1, self.args.num_field, 1]) # None * num_field * 1 - y_first_order = fluid.layers.reduce_sum(first_weights * feat_value, 1) + y_first_order = paddle.reduce_sum(first_weights * feat_value, 1) # -------------------- second order term -------------------- feat_embeddings_re = self.embedding(feat_idx) - feat_embeddings = fluid.layers.reshape( + feat_embeddings = paddle.fluid.layers.reshape( feat_embeddings_re, shape=[-1, self.args.num_field, self.args.embedding_size ]) # None * num_field * embedding_size feat_embeddings = feat_embeddings * feat_value # None * num_field * embedding_size # sum_square part - summed_features_emb = fluid.layers.reduce_sum( + summed_features_emb = paddle.reduce_sum( feat_embeddings, 1) # None * embedding_size - summed_features_emb_square = fluid.layers.square( + summed_features_emb_square = 
paddle.square( summed_features_emb) # None * embedding_size # square_sum part - squared_features_emb = fluid.layers.square( + squared_features_emb = paddle.square( feat_embeddings) # None * num_field * embedding_size - squared_sum_features_emb = fluid.layers.reduce_sum( + squared_sum_features_emb = paddle.reduce_sum( squared_features_emb, 1) # None * embedding_size - y_second_order = 0.5 * fluid.layers.reduce_sum( + y_second_order = 0.5 * paddle.reduce_sum( summed_features_emb_square - squared_sum_features_emb, 1, keep_dim=True) # None * 1 @@ -89,7 +88,7 @@ class FM(fluid.dygraph.Layer): return y_first_order, y_second_order, feat_embeddings -class DNN(fluid.dygraph.Layer): +class DNN(paddle.nn.Layer): def __init__(self, args): super(DNN, self).__init__() self.args = args @@ -101,25 +100,29 @@ class DNN(fluid.dygraph.Layer): self.init_value_ / math.sqrt(float(10)) for _ in range(len(self.args.layer_sizes)) ] + [self.init_value_] - self.linears = [] + self._layers = [] for i in range(len(self.args.layer_sizes) + 1): - linear = Linear( - sizes[i], - sizes[i + 1], - act=acts[i], - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.TruncatedNormalInitializer( + linear = paddle.nn.Linear( + in_features=sizes[i], + out_features=sizes[i + 1], + weight_attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.TruncatedNormal( loc=0.0, scale=w_scales[i])), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.TruncatedNormalInitializer( + bias_attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.TruncatedNormal( loc=0.0, scale=self.init_value_))) + #linear = getattr(paddle.nn.functional, acts[i])(linear) if acts[i] else linear + if acts[i] == 'relu': + act = paddle.nn.ReLU() + self.add_sublayer('act_%d' % i, act) self.add_sublayer('linear_%d' % i, linear) - self.linears.append(linear) + self._layers.append(linear) + self._layers.append(act) def forward(self, feat_embeddings): - y_dnn = fluid.layers.reshape( + y_dnn = paddle.fluid.layers.reshape( feat_embeddings, [-1, self.args.num_field * self.args.embedding_size]) - for linear in self.linears: - y_dnn = linear(y_dnn) + for n_layer in self._layers: + y_dnn = n_layer(y_dnn) return y_dnn diff --git a/PaddleRec/ctr/deepfm_dygraph/train.py b/PaddleRec/ctr/deepfm_dygraph/train.py index 97a7626c..9f3d5799 100644 --- a/PaddleRec/ctr/deepfm_dygraph/train.py +++ b/PaddleRec/ctr/deepfm_dygraph/train.py @@ -3,9 +3,8 @@ from __future__ import print_function import os import numpy as np -import paddle.fluid as fluid +import paddle import time -from paddle.fluid.dygraph.base import to_variable import logging import data_reader @@ -19,134 +18,135 @@ logger = logging.getLogger(__name__) def train(args): if args.use_gpu: - place = fluid.CUDAPlace(0) + place = paddle.CUDAPlace(0) else: - place = fluid.CPUPlace() - with fluid.dygraph.guard(place): - deepfm = DeepFM(args) - - train_filelist = [ - os.path.join(args.train_data_dir, x) - for x in os.listdir(args.train_data_dir) - ] - test_filelist = [ - os.path.join(args.test_data_dir, x) - for x in os.listdir(args.test_data_dir) - ] - - train_reader = data_reader.data_reader( - args.batch_size, train_filelist, args.feat_dict, data_type="train") - test_reader = data_reader.data_reader( - args.batch_size, test_filelist, args.feat_dict, data_type="test") - - def eval(epoch): - deepfm.eval() - logger.info("start eval model.") - total_step = 0.0 - auc_metric_test = fluid.metrics.Auc("ROC") - for data in test_reader(): - total_step += 1 - raw_feat_idx, raw_feat_value, label = zip(*data) - raw_feat_idx = 
np.array(raw_feat_idx, dtype=np.int64) - raw_feat_value = np.array(raw_feat_value, dtype=np.float32) - label = np.array(label, dtype=np.int64) - raw_feat_idx, raw_feat_value, label = [ - to_variable(i) - for i in [raw_feat_idx, raw_feat_value, label] - ] - - predict = deepfm(raw_feat_idx, raw_feat_value, label) + place = paddle.CPUPlace() + paddle.disable_static(place) + deepfm = DeepFM(args) + + train_filelist = [ + os.path.join(args.train_data_dir, x) + for x in os.listdir(args.train_data_dir) + ] + test_filelist = [ + os.path.join(args.test_data_dir, x) + for x in os.listdir(args.test_data_dir) + ] + + train_reader = data_reader.data_reader( + args.batch_size, train_filelist, args.feat_dict, data_type="train") + test_reader = data_reader.data_reader( + args.batch_size, test_filelist, args.feat_dict, data_type="test") + + def eval(epoch): + deepfm.eval() + logger.info("start eval model.") + total_step = 0.0 + auc_metric_test = paddle.fluid.metrics.Auc("ROC") + for data in test_reader(): + total_step += 1 + raw_feat_idx, raw_feat_value, label = zip(*data) + raw_feat_idx = np.array(raw_feat_idx, dtype=np.int64) + raw_feat_value = np.array(raw_feat_value, dtype=np.float32) + label = np.array(label, dtype=np.int64) + raw_feat_idx, raw_feat_value, label = [ + paddle.to_tensor(data=i, dtype=None, place=None, stop_gradient=True) + for i in [raw_feat_idx, raw_feat_value, label] + ] + + predict = deepfm(raw_feat_idx, raw_feat_value, label) # for auc - predict_2d = fluid.layers.concat([1 - predict, predict], 1) - auc_metric_test.update( - preds=predict_2d.numpy(), labels=label.numpy()) + predict_2d = paddle.concat(x=[1 - predict, predict], axis=1) + auc_metric_test.update( + preds=predict_2d.numpy(), labels=label.numpy()) - logger.info("test auc of epoch %d is %.6f" % - (epoch, auc_metric_test.eval())) + logger.info("test auc of epoch %d is %.6f" % + (epoch, auc_metric_test.eval())) - optimizer = fluid.optimizer.Adam( - parameter_list=deepfm.parameters(), - regularization=fluid.regularizer.L2DecayRegularizer(args.reg)) + optimizer = paddle.optimizer.Adam( + parameters=deepfm.parameters(), + weight_decay=paddle.fluid.regularizer.L2DecayRegularizer(args.reg)) # load model if exists - start_epoch = 0 - if args.checkpoint: - model_dict, optimizer_dict = fluid.dygraph.load_dygraph( - args.checkpoint) - deepfm.set_dict(model_dict) - optimizer.set_dict(optimizer_dict) - start_epoch = int( - os.path.basename(args.checkpoint).split("_")[ - -1]) + 1 # get next train epoch - logger.info("load model {} finished.".format(args.checkpoint)) - - for epoch in range(start_epoch, args.num_epoch): - begin = time.time() - batch_begin = time.time() - batch_id = 0 - total_loss = 0.0 - auc_metric = fluid.metrics.Auc("ROC") - logger.info("training epoch {} start.".format(epoch)) - - for data in train_reader(): - raw_feat_idx, raw_feat_value, label = zip(*data) - raw_feat_idx = np.array(raw_feat_idx, dtype=np.int64) - raw_feat_value = np.array(raw_feat_value, dtype=np.float32) - label = np.array(label, dtype=np.int64) - raw_feat_idx, raw_feat_value, label = [ - to_variable(i) - for i in [raw_feat_idx, raw_feat_value, label] - ] - - predict = deepfm(raw_feat_idx, raw_feat_value, label) - - loss = fluid.layers.log_loss( - input=predict, - label=fluid.layers.cast( - label, dtype="float32")) - batch_loss = fluid.layers.reduce_sum(loss) - - total_loss += batch_loss.numpy().item() - - batch_loss.backward() - optimizer.minimize(batch_loss) - deepfm.clear_gradients() + start_epoch = 0 + if args.checkpoint: + model_dict, 
optimizer_dict = paddle.fluid.dygraph.load_dygraph( + args.checkpoint) + deepfm.set_dict(model_dict) + optimizer.set_dict(optimizer_dict) + start_epoch = int( + os.path.basename(args.checkpoint).split("_")[ + -1]) + 1 # get next train epoch + logger.info("load model {} finished.".format(args.checkpoint)) + + for epoch in range(start_epoch, args.num_epoch): + begin = time.time() + batch_begin = time.time() + batch_id = 0 + total_loss = 0.0 + auc_metric = paddle.fluid.metrics.Auc("ROC") + logger.info("training epoch {} start.".format(epoch)) + + for data in train_reader(): + raw_feat_idx, raw_feat_value, label = zip(*data) + raw_feat_idx = np.array(raw_feat_idx, dtype=np.int64) + raw_feat_value = np.array(raw_feat_value, dtype=np.float32) + label = np.array(label, dtype=np.int64) + raw_feat_idx, raw_feat_value, label = [ + paddle.to_tensor(data=i, dtype=None, place=None, stop_gradient=True) + for i in [raw_feat_idx, raw_feat_value, label] + ] + + predict = deepfm(raw_feat_idx, raw_feat_value, label) + + loss = paddle.nn.functional.log_loss( + input=predict, + label=paddle.cast( + label, dtype="float32")) + batch_loss = paddle.reduce_sum(loss) + + total_loss += batch_loss.numpy().item() + + batch_loss.backward() + optimizer.minimize(batch_loss) + deepfm.clear_gradients() # for auc - predict_2d = fluid.layers.concat([1 - predict, predict], 1) - auc_metric.update( - preds=predict_2d.numpy(), labels=label.numpy()) - - if batch_id > 0 and batch_id % 100 == 0: - logger.info( - "epoch: {}, batch_id: {}, loss: {:.6f}, auc: {:.6f}, speed: {:.2f} ins/s". - format(epoch, batch_id, total_loss / args.batch_size / - 100, - auc_metric.eval(), 100 * args.batch_size / ( - time.time() - batch_begin))) - batch_begin = time.time() - total_loss = 0.0 - - batch_id += 1 - logger.info("epoch %d is finished and takes %f s" % - (epoch, time.time() - begin)) + predict_2d = paddle.concat(x=[1 - predict, predict], axis=1) + auc_metric.update( + preds=predict_2d.numpy(), labels=label.numpy()) + + if batch_id > 0 and batch_id % 100 == 0: + logger.info( + "epoch: {}, batch_id: {}, loss: {:.6f}, auc: {:.6f}, speed: {:.2f} ins/s". 
+ format(epoch, batch_id, total_loss / args.batch_size / + 100, + auc_metric.eval(), 100 * args.batch_size / ( + time.time() - batch_begin))) + batch_begin = time.time() + total_loss = 0.0 + + batch_id += 1 + logger.info("epoch %d is finished and takes %f s" % + (epoch, time.time() - begin)) # save model and optimizer - logger.info("going to save epoch {} model and optimizer.".format( - epoch)) - fluid.dygraph.save_dygraph( - deepfm.state_dict(), - model_path=os.path.join(args.model_output_dir, - "epoch_" + str(epoch))) - fluid.dygraph.save_dygraph( - optimizer.state_dict(), - model_path=os.path.join(args.model_output_dir, - "epoch_" + str(epoch))) - logger.info("save epoch {} finished.".format(epoch)) + logger.info("going to save epoch {} model and optimizer.".format( + epoch)) + paddle.fluid.dygraph.save_dygraph( + deepfm.state_dict(), + model_path=os.path.join(args.model_output_dir, + "epoch_" + str(epoch))) + paddle.fluid.dygraph.save_dygraph( + optimizer.state_dict(), + model_path=os.path.join(args.model_output_dir, + "epoch_" + str(epoch))) + logger.info("save epoch {} finished.".format(epoch)) # eval model - deepfm.eval() - eval(epoch) - deepfm.train() + deepfm.eval() + eval(epoch) + deepfm.train() + paddle.enable_static() if __name__ == '__main__': diff --git a/PaddleRec/ctr/din/README.md b/PaddleRec/ctr/din/README.md index ea8585c0..e3ca1cfe 100644 --- a/PaddleRec/ctr/din/README.md +++ b/PaddleRec/ctr/din/README.md @@ -1,21 +1,6 @@ # DIN -以下是本例的简要目录结构及说明: - -```text -. -├── README.md # 文档 -├── train.py # 训练脚本 -├── infer.py # 预测脚本 -├── network.py # 网络结构 -├── reader.py # 和读取数据相关的函数 -├── data/ - ├── build_dataset.py # 文本数据转化为paddle数据 - ├── convert_pd.py # 将原始数据转化为pandas的dataframe - ├── data_process.sh # 数据预处理脚本 - ├── remap_id.py # remap类别id - -``` +models/PaddleRec只是提供了经典推荐算法的Paddle实现,我们已经开源了功能更强大的工具组件[PaddlePaddle/PaddleRec](https://github.com/PaddlePaddle/PaddleRec) 打通了推荐算法+分布式训练全流程,并提供了高级API,在单机和分布式间可以实现无缝切换。后续我们将在[PaddlePaddle/PaddleRec](https://github.com/PaddlePaddle/PaddleRec) Repo中发布新的模型和功能,models/PaddleRec不再更新维护。 ## 简介 @@ -26,104 +11,3 @@ DIN通过一个兴趣激活模块(Activation Unit),用预估目标Candidate AD 权重高的历史行为表明这部分兴趣和当前广告相关,权重低的则是和广告无关的”兴趣噪声“。我们通过将激活的商品和激活权重相乘,然后累加起来作为当前预估目标ADs相关的兴趣状态表达。 最后我们将这相关的用户兴趣表达、用户静态特征和上下文相关特征,以及ad相关的特征拼接起来,输入到后续的多层DNN网络,最后预测得到用户对当前目标ADs的点击概率。 - - -## 数据下载及预处理 - -* Step 1: 运行如下命令 下载[Amazon Product数据集](http://jmcauley.ucsd.edu/data/amazon/)并进行预处理 -``` -cd data && sh data_process.sh && cd .. 
-``` -如果执行过程中遇到找不到某个包(例如pandas包)的报错,使用如下命令安装对应的包即可。 -``` -pip install pandas -``` - -**Windows系统下请用户自行下载数据进行解压,下载链接为:[reviews_Electronics](http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz)和[meta_Electronics](http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Electronics.json.gz)。** - - -* Step 2: 产生训练集、测试集和config文件 -``` -python build_dataset.py -``` -运行之后在data文件夹下会产生config.txt、paddle_test.txt、paddle_train.txt三个文件 - -数据格式例子如下: -``` -3737 19450;288 196;18486;674;1 -3647 4342 6855 3805;281 463 558 674;4206;463;1 -1805 4309;87 87;21354;556;1 -18209 20753;649 241;51924;610;0 -13150;351;41455;792;1 -35120 40418;157 714;52035;724;0 -``` - -其中每一行是一个Sample,由分号分隔的5个域组成。前两个域是历史交互的item序列和item对应的类别,第三、四个域是待预测的item和其类别,最后一个域是label,表示点击与否。 - - -## 训练 - -具体的参数配置说明可通过运行下列代码查看 -``` -python train.py -h -``` - -gpu 单机单卡训练 -``` bash -CUDA_VISIBLE_DEVICES=1 python -u train.py --config_path 'data/config.txt' --train_dir 'data/paddle_train.txt' --batch_size 32 --epoch_num 100 --use_cuda 1 > log.txt 2>&1 & -``` - -cpu 单机训练 -``` bash -python -u train.py --config_path 'data/config.txt' --train_dir 'data/paddle_train.txt' --batch_size 32 --epoch_num 100 --use_cuda 0 > log.txt 2>&1 & -``` - -值得注意的是上述单卡训练可以通过加--parallel 1参数使用Parallel Executor来进行加速 - -gpu 单机多卡训练 -``` bash -CUDA_VISIBLE_DEVICES=0,1 python -u train.py --config_path 'data/config.txt' --train_dir 'data/paddle_train.txt' --batch_size 32 --epoch_num 100 --use_cuda 1 --parallel 1 --num_devices 2 > log.txt 2>&1 & -``` - -cpu 单机多卡训练 -``` bash -CPU_NUM=10 python -u train.py --config_path 'data/config.txt' --train_dir 'data/paddle_train.txt' --batch_size 32 --epoch_num 100 --use_cuda 0 --parallel 1 --num_devices 10 > log.txt 2>&1 & -``` - - -## 训练结果示例 - -我们在Tesla K40m单GPU卡上训练的日志如下所示(以实际输出为准) -```text -2019-02-22 09:31:51,578 - INFO - reading data begins -2019-02-22 09:32:22,407 - INFO - reading data completes -W0222 09:32:24.151955 7221 device_context.cc:263] Please NOTE: device: 0, CUDA Capability: 35, Driver API Version: 9.0, Runtime API Version: 8.0 -W0222 09:32:24.152046 7221 device_context.cc:271] device: 0, cuDNN Version: 7.0. -2019-02-22 09:32:27,797 - INFO - train begins -epoch: 1 global_step: 1000 train_loss: 0.6950 time: 14.64 -epoch: 1 global_step: 2000 train_loss: 0.6854 time: 15.41 -epoch: 1 global_step: 3000 train_loss: 0.6799 time: 14.84 -... -model saved in din_amazon/global_step_50000 -... -``` - -提示: - -* 在单机条件下,使用代码中默认的超参数运行时,产生最优auc的global step大致在440000到500000之间 - -* 训练超出一定的epoch后会稍稍出现过拟合 - -## 预测 -参考如下命令,开始预测. - -其中model_path为模型的路径,test_path为测试数据路径。 - -``` -CUDA_VISIBLE_DEVICES=3 python infer.py --model_path 'din_amazon/global_step_400000' --test_path 'data/paddle_test.txt' --use_cuda 1 -``` - -## 预测结果示例 -```text -2019-02-22 11:22:58,804 - INFO - TEST --> loss: [0.47005194] auc:0.863794952818 -``` diff --git a/PaddleRec/ctr/din/__init__.py b/PaddleRec/ctr/din/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/PaddleRec/ctr/din/_ce.py b/PaddleRec/ctr/din/_ce.py deleted file mode 100644 index e331d1bb..00000000 --- a/PaddleRec/ctr/din/_ce.py +++ /dev/null @@ -1,62 +0,0 @@ -# this file is only used for continuous evaluation test! 
- -import os -import sys -sys.path.append(os.environ['ceroot']) -from kpi import CostKpi -from kpi import DurationKpi - -each_pass_duration_card1_kpi = DurationKpi( - 'each_pass_duration_card1', 0.08, 0, actived=True) -train_loss_card1_kpi = CostKpi('train_loss_card1', 0.08, 0) -each_pass_duration_card4_kpi = DurationKpi( - 'each_pass_duration_card4', 0.08, 0, actived=True) -train_loss_card4_kpi = CostKpi('train_loss_card4', 0.08, 0) - -tracking_kpis = [ - each_pass_duration_card1_kpi, - train_loss_card1_kpi, - each_pass_duration_card4_kpi, - train_loss_card4_kpi, -] - - -def parse_log(log): - ''' - This method should be implemented by model developers. - - The suggestion: - - each line in the log should be key, value, for example: - - " - train_cost\t1.0 - test_cost\t1.0 - train_cost\t1.0 - train_cost\t1.0 - train_acc\t1.2 - " - ''' - for line in log.split('\n'): - fs = line.strip().split('\t') - print(fs) - if len(fs) == 3 and fs[0] == 'kpis': - kpi_name = fs[1] - kpi_value = float(fs[2]) - yield kpi_name, kpi_value - - -def log_to_ce(log): - kpi_tracker = {} - for kpi in tracking_kpis: - kpi_tracker[kpi.name] = kpi - - for (kpi_name, kpi_value) in parse_log(log): - print(kpi_name, kpi_value) - kpi_tracker[kpi_name].add_record(kpi_value) - kpi_tracker[kpi_name].persist() - - -if __name__ == '__main__': - log = sys.stdin.read() - log_to_ce(log) diff --git a/PaddleRec/ctr/din/data/build_dataset.py b/PaddleRec/ctr/din/data/build_dataset.py deleted file mode 100644 index 34c053cc..00000000 --- a/PaddleRec/ctr/din/data/build_dataset.py +++ /dev/null @@ -1,87 +0,0 @@ -from __future__ import print_function -import random -import pickle - -random.seed(1234) - -print("read and process data") - -with open('./raw_data/remap.pkl', 'rb') as f: - reviews_df = pickle.load(f) - cate_list = pickle.load(f) - user_count, item_count, cate_count, example_count = pickle.load(f) - -train_set = [] -test_set = [] -for reviewerID, hist in reviews_df.groupby('reviewerID'): - pos_list = hist['asin'].tolist() - - def gen_neg(): - neg = pos_list[0] - while neg in pos_list: - neg = random.randint(0, item_count - 1) - return neg - - neg_list = [gen_neg() for i in range(len(pos_list))] - - for i in range(1, len(pos_list)): - hist = pos_list[:i] - if i != len(pos_list) - 1: - train_set.append((reviewerID, hist, pos_list[i], 1)) - train_set.append((reviewerID, hist, neg_list[i], 0)) - else: - label = (pos_list[i], neg_list[i]) - test_set.append((reviewerID, hist, label)) - -random.shuffle(train_set) -random.shuffle(test_set) - -assert len(test_set) == user_count - - -def print_to_file(data, fout): - for i in range(len(data)): - fout.write(str(data[i])) - if i != len(data) - 1: - fout.write(' ') - else: - fout.write(';') - - -print("make train data") -with open("paddle_train.txt", "w") as fout: - for line in train_set: - history = line[1] - target = line[2] - label = line[3] - cate = [cate_list[x] for x in history] - print_to_file(history, fout) - print_to_file(cate, fout) - fout.write(str(target) + ";") - fout.write(str(cate_list[target]) + ";") - fout.write(str(label) + "\n") - -print("make test data") -with open("paddle_test.txt", "w") as fout: - for line in test_set: - history = line[1] - target = line[2] - cate = [cate_list[x] for x in history] - - print_to_file(history, fout) - print_to_file(cate, fout) - fout.write(str(target[0]) + ";") - fout.write(str(cate_list[target[0]]) + ";") - fout.write("1\n") - - print_to_file(history, fout) - print_to_file(cate, fout) - fout.write(str(target[1]) + ";") - 
fout.write(str(cate_list[target[1]]) + ";") - fout.write("0\n") - -print("make config data") -with open('config.txt', 'w') as f: - f.write(str(user_count) + "\n") - f.write(str(item_count) + "\n") - f.write(str(cate_count) + "\n") diff --git a/PaddleRec/ctr/din/data/convert_pd.py b/PaddleRec/ctr/din/data/convert_pd.py deleted file mode 100644 index d7927c7e..00000000 --- a/PaddleRec/ctr/din/data/convert_pd.py +++ /dev/null @@ -1,27 +0,0 @@ -from __future__ import print_function -import pickle -import pandas as pd - - -def to_df(file_path): - with open(file_path, 'r') as fin: - df = {} - i = 0 - for line in fin: - df[i] = eval(line) - i += 1 - df = pd.DataFrame.from_dict(df, orient='index') - return df - - -print("start to analyse reviews_Electronics_5.json") -reviews_df = to_df('./raw_data/reviews_Electronics_5.json') -with open('./raw_data/reviews.pkl', 'wb') as f: - pickle.dump(reviews_df, f, pickle.HIGHEST_PROTOCOL) - -print("start to analyse meta_Electronics.json") -meta_df = to_df('./raw_data/meta_Electronics.json') -meta_df = meta_df[meta_df['asin'].isin(reviews_df['asin'].unique())] -meta_df = meta_df.reset_index(drop=True) -with open('./raw_data/meta.pkl', 'wb') as f: - pickle.dump(meta_df, f, pickle.HIGHEST_PROTOCOL) diff --git a/PaddleRec/ctr/din/data/data_process.sh b/PaddleRec/ctr/din/data/data_process.sh deleted file mode 100644 index 7bcfc55f..00000000 --- a/PaddleRec/ctr/din/data/data_process.sh +++ /dev/null @@ -1,15 +0,0 @@ -#! /bin/bash - -set -e -echo "begin download data" -mkdir raw_data -cd raw_data -wget -c http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz -gzip -d reviews_Electronics_5.json.gz -wget -c http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Electronics.json.gz -gzip -d meta_Electronics.json.gz -echo "download data successfully" - -cd .. 
-python convert_pd.py -python remap_id.py diff --git a/PaddleRec/ctr/din/data/remap_id.py b/PaddleRec/ctr/din/data/remap_id.py deleted file mode 100644 index b110dac5..00000000 --- a/PaddleRec/ctr/din/data/remap_id.py +++ /dev/null @@ -1,48 +0,0 @@ -from __future__ import print_function -import random -import pickle -import numpy as np - -random.seed(1234) - -with open('./raw_data/reviews.pkl', 'rb') as f: - reviews_df = pickle.load(f) - reviews_df = reviews_df[['reviewerID', 'asin', 'unixReviewTime']] -with open('./raw_data/meta.pkl', 'rb') as f: - meta_df = pickle.load(f) - meta_df = meta_df[['asin', 'categories']] - meta_df['categories'] = meta_df['categories'].map(lambda x: x[-1][-1]) - - -def build_map(df, col_name): - key = sorted(df[col_name].unique().tolist()) - m = dict(zip(key, range(len(key)))) - df[col_name] = df[col_name].map(lambda x: m[x]) - return m, key - - -asin_map, asin_key = build_map(meta_df, 'asin') -cate_map, cate_key = build_map(meta_df, 'categories') -revi_map, revi_key = build_map(reviews_df, 'reviewerID') - -user_count, item_count, cate_count, example_count =\ - len(revi_map), len(asin_map), len(cate_map), reviews_df.shape[0] -print('user_count: %d\titem_count: %d\tcate_count: %d\texample_count: %d' % - (user_count, item_count, cate_count, example_count)) - -meta_df = meta_df.sort_values('asin') -meta_df = meta_df.reset_index(drop=True) -reviews_df['asin'] = reviews_df['asin'].map(lambda x: asin_map[x]) -reviews_df = reviews_df.sort_values(['reviewerID', 'unixReviewTime']) -reviews_df = reviews_df.reset_index(drop=True) -reviews_df = reviews_df[['reviewerID', 'asin', 'unixReviewTime']] - -cate_list = [meta_df['categories'][i] for i in range(len(asin_map))] -cate_list = np.array(cate_list, dtype=np.int32) - -with open('./raw_data/remap.pkl', 'wb') as f: - pickle.dump(reviews_df, f, pickle.HIGHEST_PROTOCOL) # uid, iid - pickle.dump(cate_list, f, pickle.HIGHEST_PROTOCOL) # cid of iid line - pickle.dump((user_count, item_count, cate_count, example_count), f, - pickle.HIGHEST_PROTOCOL) - pickle.dump((asin_key, cate_key, revi_key), f, pickle.HIGHEST_PROTOCOL) diff --git a/PaddleRec/ctr/din/network.py b/PaddleRec/ctr/din/network.py deleted file mode 100644 index 2d5c9542..00000000 --- a/PaddleRec/ctr/din/network.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. 
- -import paddle.fluid as fluid - - -def din_attention(hist, target_expand, mask): - """activation weight""" - - hidden_size = hist.shape[-1] - - concat = fluid.layers.concat( - [hist, target_expand, hist - target_expand, hist * target_expand], - axis=2) - atten_fc1 = fluid.layers.fc(name="atten_fc1", - input=concat, - size=80, - act="sigmoid", - num_flatten_dims=2) - atten_fc2 = fluid.layers.fc(name="atten_fc2", - input=atten_fc1, - size=40, - act="sigmoid", - num_flatten_dims=2) - atten_fc3 = fluid.layers.fc(name="atten_fc3", - input=atten_fc2, - size=1, - num_flatten_dims=2) - atten_fc3 += mask - atten_fc3 = fluid.layers.transpose(x=atten_fc3, perm=[0, 2, 1]) - atten_fc3 = fluid.layers.scale(x=atten_fc3, scale=hidden_size**-0.5) - weight = fluid.layers.softmax(atten_fc3) - out = fluid.layers.matmul(weight, hist) - out = fluid.layers.reshape(x=out, shape=[0, hidden_size]) - return out - - -def network(item_count, cat_count): - """network definition""" - - seq_len = -1 - item_emb_size = 64 - cat_emb_size = 64 - is_sparse = False - #significant for speeding up the training process - - item_emb_attr = fluid.ParamAttr(name="item_emb") - cat_emb_attr = fluid.ParamAttr(name="cat_emb") - - hist_item_seq = fluid.data( - name="hist_item_seq", shape=[None, seq_len], dtype="int64") - hist_cat_seq = fluid.data( - name="hist_cat_seq", shape=[None, seq_len], dtype="int64") - target_item = fluid.data(name="target_item", shape=[None], dtype="int64") - target_cat = fluid.data(name="target_cat", shape=[None], dtype="int64") - label = fluid.data(name="label", shape=[None, 1], dtype="float32") - mask = fluid.data(name="mask", shape=[None, seq_len, 1], dtype="float32") - target_item_seq = fluid.data( - name="target_item_seq", shape=[None, seq_len], dtype="int64") - target_cat_seq = fluid.data( - name="target_cat_seq", shape=[None, seq_len], dtype="int64") - - hist_item_emb = fluid.embedding( - input=hist_item_seq, - size=[item_count, item_emb_size], - param_attr=item_emb_attr, - is_sparse=is_sparse) - - hist_cat_emb = fluid.embedding( - input=hist_cat_seq, - size=[cat_count, cat_emb_size], - param_attr=cat_emb_attr, - is_sparse=is_sparse) - - target_item_emb = fluid.embedding( - input=target_item, - size=[item_count, item_emb_size], - param_attr=item_emb_attr, - is_sparse=is_sparse) - - target_cat_emb = fluid.embedding( - input=target_cat, - size=[cat_count, cat_emb_size], - param_attr=cat_emb_attr, - is_sparse=is_sparse) - - target_item_seq_emb = fluid.embedding( - input=target_item_seq, - size=[item_count, item_emb_size], - param_attr=item_emb_attr, - is_sparse=is_sparse) - - target_cat_seq_emb = fluid.embedding( - input=target_cat_seq, - size=[cat_count, cat_emb_size], - param_attr=cat_emb_attr, - is_sparse=is_sparse) - - item_b = fluid.embedding( - input=target_item, - size=[item_count, 1], - param_attr=fluid.initializer.Constant(value=0.0)) - - hist_seq_concat = fluid.layers.concat([hist_item_emb, hist_cat_emb], axis=2) - target_seq_concat = fluid.layers.concat( - [target_item_seq_emb, target_cat_seq_emb], axis=2) - target_concat = fluid.layers.concat( - [target_item_emb, target_cat_emb], axis=1) - - out = din_attention(hist_seq_concat, target_seq_concat, mask) - out_fc = fluid.layers.fc(name="out_fc", - input=out, - size=item_emb_size + cat_emb_size, - num_flatten_dims=1) - embedding_concat = fluid.layers.concat([out_fc, target_concat], axis=1) - - fc1 = fluid.layers.fc(name="fc1", - input=embedding_concat, - size=80, - act="sigmoid") - fc2 = fluid.layers.fc(name="fc2", input=fc1, size=40, 
act="sigmoid") - fc3 = fluid.layers.fc(name="fc3", input=fc2, size=1) - logit = fc3 + item_b - - loss = fluid.layers.sigmoid_cross_entropy_with_logits(x=logit, label=label) - avg_loss = fluid.layers.mean(loss) - return avg_loss, fluid.layers.sigmoid(logit), \ - [hist_item_seq, hist_cat_seq, target_item, \ - target_cat, label, mask, target_item_seq, target_cat_seq] diff --git a/PaddleRec/ctr/din/reader.py b/PaddleRec/ctr/din/reader.py deleted file mode 100644 index 02122434..00000000 --- a/PaddleRec/ctr/din/reader.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. - -import os -import random -import numpy as np -import paddle -import pickle - - -def pad_batch_data(input, max_len): - res = np.array([x + [0] * (max_len - len(x)) for x in input]) - res = res.astype("int64").reshape([-1, max_len]) - return res - - -def make_data(b): - max_len = max(len(x[0]) for x in b) - item = pad_batch_data([x[0] for x in b], max_len) - cat = pad_batch_data([x[1] for x in b], max_len) - len_array = [len(x[0]) for x in b] - mask = np.array( - [[0] * x + [-1e9] * (max_len - x) for x in len_array]).reshape( - [-1, max_len, 1]) - target_item_seq = np.array( - [[x[2]] * max_len for x in b]).astype("int64").reshape([-1, max_len]) - target_cat_seq = np.array( - [[x[3]] * max_len for x in b]).astype("int64").reshape([-1, max_len]) - res = [] - for i in range(len(b)): - res.append([ - item[i], cat[i], b[i][2], b[i][3], b[i][4], mask[i], - target_item_seq[i], target_cat_seq[i] - ]) - return res - - -def batch_reader(reader, batch_size, group_size): - def batch_reader(): - bg = [] - for line in reader: - bg.append(line) - if len(bg) == group_size: - sortb = sorted(bg, key=lambda x: len(x[0]), reverse=False) - bg = [] - for i in range(0, group_size, batch_size): - b = sortb[i:i + batch_size] - yield make_data(b) - len_bg = len(bg) - if len_bg != 0: - sortb = sorted(bg, key=lambda x: len(x[0]), reverse=False) - bg = [] - remain = len_bg % batch_size - for i in range(0, len_bg - remain, batch_size): - b = sortb[i:i + batch_size] - yield make_data(b) - - return batch_reader - - -def base_read(file_dir): - res = [] - max_len = 0 - with open(file_dir, "r") as fin: - for line in fin: - line = line.strip().split(';') - hist = line[0].split() - cate = line[1].split() - max_len = max(max_len, len(hist)) - res.append([hist, cate, line[2], line[3], float(line[4])]) - return res, max_len - - -def prepare_reader(data_path, bs): - data_set, max_len = base_read(data_path) - random.shuffle(data_set) - return batch_reader(data_set, bs, bs * 20), max_len - - -def config_read(config_path): - with open(config_path, "r") as fin: - user_count = int(fin.readline().strip()) - item_count = int(fin.readline().strip()) - cat_count = int(fin.readline().strip()) - return user_count, item_count, cat_count diff --git a/PaddleRec/ctr/dnn/README.md b/PaddleRec/ctr/dnn/README.md index 989de5a0..6a1e7ed2 100644 --- a/PaddleRec/ctr/dnn/README.md +++ 
b/PaddleRec/ctr/dnn/README.md @@ -1,787 +1,8 @@ # 基于DNN模型的点击率预估模型 +models/PaddleRec只是提供了经典推荐算法的Paddle实现,我们已经开源了功能更强大的工具组件[PaddlePaddle/PaddleRec](https://github.com/PaddlePaddle/PaddleRec) 打通了推荐算法+分布式训练全流程,并提供了高级API,在单机和分布式间可以实现无缝切换。后续我们将在[PaddlePaddle/PaddleRec](https://github.com/PaddlePaddle/PaddleRec) Repo中发布新的模型和功能,models/PaddleRec不再更新维护。 + ## 介绍 `CTR(Click Through Rate)`,即点击率,是“推荐系统/计算广告”等领域的重要指标,对其进行预估是商品推送/广告投放等决策的基础。简单来说,CTR预估对每次广告的点击情况做出预测,预测用户是点击还是不点击。CTR预估模型综合考虑各种因素、特征,在大量历史数据上训练,最终对商业决策提供帮助。本模型实现了下述论文中提出的DNN模型: -```text -@inproceedings{guo2017deepfm, - title={DeepFM: A Factorization-Machine based Neural Network for CTR Prediction}, - author={Huifeng Guo, Ruiming Tang, Yunming Ye, Zhenguo Li and Xiuqiang He}, - booktitle={the Twenty-Sixth International Joint Conference on Artificial Intelligence (IJCAI)}, - pages={1725--1731}, - year={2017} -} -``` -# -## 目录 -* [运行环境](#运行环境) -* [数据准备](#数据准备) - * [数据来源](#数据来源) - * [数据预处理](#数据预处理) - * [一键下载训练及测试数据](#一键下载训练及测试数据) -* [模型组网](#模型组网) - * [数据输入声明](#数据输入声明) - * [CTR-DNN模型组网](#ctr-dnn模型组网) - * [Embedding层](#embedding层) - * [FC层](#fc层) - * [Loss及Auc计算](#loss及auc计算) -* [dataset数据读取](#dataset数据读取) - * [引入dataset](#引入dataset) - * [如何指定数据读取规则](#如何指定数据读取规则) - * [快速调试Dataset](#快速调试dataset) -* [单机训练 VS 分布式训练](#单机训练-vs-分布式训练) - * [区别一:数据需要分配到各个训练节点上](#区别一数据需要分配到各个训练节点上) - * [区别二:每个节点需要扮演不同的角色](#区别二每个节点需要扮演不同的角色) - * [共有的环境变量](#共有的环境变量) - * [Pserver特有的环境变量](#pserver特有的环境变量) - * [Trainer特有的环境变量](#trainer特有的环境变量) - * [区别三 分布式需要指定训练策略](#区别三-分布式需要指定训练策略) - * [区别四 分布式训练需要分别运行Pserver与Trainer](#区别四-分布式训练需要分别运行pserver与trainer) - * [区别五 启动训练](#区别五-启动训练) - * [运行单机训练](#运行单机训练) - * [运行分布式训练(本地模拟分布式)](#运行分布式训练本地模拟分布式) - * [区别六 保存模型](#区别六-保存模型) - * [单机训练中模型的保存](#单机训练中模型的保存) - * [分布式训练中模型的保存](#分布式训练中模型的保存) - * [区别七 增量训练](#区别七-增量训练) - * [单机增量训练](#单机增量训练) - * [分布式增量训练](#分布式增量训练) -* [单机离线预测](#单机离线预测) - * [构建预测网络及加载模型参数](#构建预测网络及加载模型参数) - * [测试数据的读取](#测试数据的读取) - * [AUC的清零步骤](#auc的清零步骤) - * [运行Infer](#运行infer) - * [benchmark](#benchmark) -* [启动分布式训练](#启动分布式训练) - * [训练代码准备](#训练代码准备) - * [运行环境准备](#运行环境准备) - * [启动server](#启动server) - * [启动worker](#启动worker) - -# -## 运行环境 -**示例训练代码仅支持在Linux环境下运行** -- Win/Mac 暂不支持dataset数据读取方式 -- Win/Mac 可以使用其他数据读取方式改写本示例代码并运行(参照`infer.py`) -- 目前仅支持Linux,如:`unbuntu`及`CentOS` -- 目前仅支持python版本`2.7` -- 请确保您的paddle版本高于`1.6.1`,可以利用pip升级您的paddle版本 -- 请确保您的本地模拟分布式运行环境中没有设置`http/https`代理,可以在终端键入`env`查看环境变量 - -# -## 数据准备 -### 数据来源 -训练及测试数据集选用[Display Advertising Challenge](https://www.kaggle.com/c/criteo-display-ad-challenge/)所用的Criteo数据集。该数据集包括两部分:训练集和测试集。训练集包含一段时间内Criteo的部分流量,测试集则对应训练数据后一天的广告点击流量。 -每一行数据格式如下所示: -```bash -