From 1d50478e2d096d7a3025881a744174dc3c7b8b4e Mon Sep 17 00:00:00 2001
From: zhoushiyu <31816202+wilhelmzh@users.noreply.github.com>
Date: Fri, 27 Dec 2019 15:58:27 +0800
Subject: [PATCH] DeepFM dygraph (#4091)

* deepfm dygraph

* deepfm dygraph fix readme

* adapt dygraph dev api and fix readme

* fix eval and readme

* deepfm dygraph add infer.py, format train output

* fix bug of last commit

* add more log in train and infer. remove is_sparse
---
 PaddleRec/ctr/deepfm_dygraph/README.md        |  73 +++++++++
 .../data/aid_data/train_file_idx.txt          |   1 +
 .../data/download_preprocess.py               |  27 +++
 .../ctr/deepfm_dygraph/data/preprocess.py     | 120 ++++++++++++++
 PaddleRec/ctr/deepfm_dygraph/data_reader.py   |  75 +++++++++
 PaddleRec/ctr/deepfm_dygraph/infer.py         |  90 ++++++++++
 PaddleRec/ctr/deepfm_dygraph/network.py       | 125 ++++++++++++++
 PaddleRec/ctr/deepfm_dygraph/train.py         | 155 ++++++++++++++++++
 PaddleRec/ctr/deepfm_dygraph/utility.py       |  98 +++++++++++
 9 files changed, 764 insertions(+)
 create mode 100644 PaddleRec/ctr/deepfm_dygraph/README.md
 create mode 100644 PaddleRec/ctr/deepfm_dygraph/data/aid_data/train_file_idx.txt
 create mode 100644 PaddleRec/ctr/deepfm_dygraph/data/download_preprocess.py
 create mode 100644 PaddleRec/ctr/deepfm_dygraph/data/preprocess.py
 create mode 100644 PaddleRec/ctr/deepfm_dygraph/data_reader.py
 create mode 100644 PaddleRec/ctr/deepfm_dygraph/infer.py
 create mode 100644 PaddleRec/ctr/deepfm_dygraph/network.py
 create mode 100644 PaddleRec/ctr/deepfm_dygraph/train.py
 create mode 100644 PaddleRec/ctr/deepfm_dygraph/utility.py

diff --git a/PaddleRec/ctr/deepfm_dygraph/README.md b/PaddleRec/ctr/deepfm_dygraph/README.md
new file mode 100644
index 00000000..fc9b14aa
--- /dev/null
+++ b/PaddleRec/ctr/deepfm_dygraph/README.md
@@ -0,0 +1,73 @@
+
+# DeepFM (Dygraph)
+
+The following is a brief directory structure and description of this example:
+
+```text
+.
+├── README.md            # Documentation
+├── train.py             # Local training script
+├── infer.py             # Local inference script
+├── network.py           # Network structure
+├── data_reader.py       # Data reading functions
+├── utility.py           # Argument parsing and common utilities
+├── data/
+    ├── download_preprocess.py  # Script to download and preprocess data
+    ├── preprocess.py           # Data preprocessing script
+
+```
+
+## Introduction
+This model reproduces DeepFM with the PaddlePaddle **dygraph** (dynamic graph) API.
+
+For an introduction to the DeepFM model, see the paper [DeepFM: A Factorization-Machine based Neural Network for CTR Prediction](https://arxiv.org/abs/1703.04247).
+
+## Requirements
+- **This model currently requires PaddlePaddle 1.7 or the latest develop version.**
+
+## Data download and preprocessing
+
+We train and test DeepFM on the [Criteo](https://www.kaggle.com/c/criteo-display-ad-challenge/) dataset, which contains about 45 million records. The first column of each line is the label, indicating whether the ad was clicked; the remaining columns are 13 integer features (I1 - I13) and 26 categorical features (C1 - C26).
+
+Continuous features are mapped into the [0, 1] interval by min-max normalization, and categorical feature values that occur fewer than 10 times are dropped. The dataset is split into two parts: 90% for training and 10% for evaluating the model.
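+
+As an illustration only (this is not part of the pipeline), the min-max scaling of a single continuous value looks like the sketch below, where `cont_min` and `cont_max` stand for the per-feature bounds hard-coded in `data_reader.py`:
+
+```python
+# Illustrative sketch: scale one raw continuous feature into [0, 1].
+# cont_min / cont_max are the per-feature bounds from data_reader.py.
+def min_max_normalize(value, cont_min, cont_max):
+    return (float(value) - cont_min) / (cont_max - cont_min)
+```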
+
+Download and preprocess the data with:
+```bash
+cd data && python download_preprocess.py && cd ..
+```
+
+After the command finishes, three folders are generated: train_data, test_data and aid_data.
+
+train_data holds 90% of the data, test_data holds the remaining 10%, and aid_data contains a feature dictionary, feat_dict_10.pkl2, that is either generated or downloaded (to save users the time of building it).
+
+## Training
+
+```bash
+CUDA_VISIBLE_DEVICES=0 python -u train.py > train.log 2>&1 &
+```
+
+The model is evaluated on the test set after each training epoch.
+
+To load a saved model and continue training:
+
+```bash
+# Load the saved epoch_0 model and continue training
+CUDA_VISIBLE_DEVICES=0 python -u train.py --checkpoint=models/epoch_0 > train.log 2>&1 &
+```
+
+## Inference
+
+```bash
+CUDA_VISIBLE_DEVICES=0 python infer.py --checkpoint=models/epoch_0
+```
+
+This loads the model from models/epoch_0, predicts on the data in test_data, and evaluates the model. Note: only the last line of the output is the AUC over the entire test set.
+
+## Results
+```text
+test auc of epoch 0 is 0.802877
+```
+
+After the first training epoch, the test AUC is 0.802877.
+
+Training for more epochs tends to overfit; evaluate the saved checkpoints and pick the best-performing one as the final model.
diff --git a/PaddleRec/ctr/deepfm_dygraph/data/aid_data/train_file_idx.txt b/PaddleRec/ctr/deepfm_dygraph/data/aid_data/train_file_idx.txt
new file mode 100644
index 00000000..680c603c
--- /dev/null
+++ b/PaddleRec/ctr/deepfm_dygraph/data/aid_data/train_file_idx.txt
@@ -0,0 +1 @@
+[156, 51, 24, 103, 195, 35, 188, 16, 224, 173, 116, 3, 226, 11, 64, 94, 6, 70, 197, 164, 220, 77, 172, 194, 227, 12, 65, 129, 39, 38, 75, 210, 215, 36, 46, 185, 76, 222, 108, 78, 120, 71, 33, 189, 135, 97, 90, 219, 105, 205, 136, 167, 106, 29, 157, 125, 217, 121, 175, 143, 200, 45, 179, 37, 86, 140, 225, 47, 20, 228, 4, 209, 177, 178, 171, 58, 48, 118, 9, 149, 55, 192, 82, 17, 43, 54, 93, 96, 159, 216, 18, 206, 223, 104, 132, 182, 60, 109, 28, 180, 44, 166, 128, 27, 163, 141, 229, 102, 150, 7, 83, 198, 41, 191, 114, 117, 122, 161, 130, 174, 176, 160, 201, 49, 112, 69, 165, 95, 133, 92, 59, 110, 151, 203, 67, 169, 21, 66, 80, 22, 23, 152, 40, 127, 111, 186, 72, 26, 190, 42, 0, 63, 53, 124, 137, 85, 126, 196, 187, 208, 98, 25, 15, 170, 193, 168, 202, 31, 146, 147, 113, 32, 204, 131, 68, 84, 213, 19, 81, 79, 162, 199, 107, 50, 2, 207, 10, 181, 144, 139, 134, 62, 155, 142, 214, 212, 61, 52, 101, 99, 158, 145, 13, 153, 56, 184, 221]
\ No newline at end of file
diff --git a/PaddleRec/ctr/deepfm_dygraph/data/download_preprocess.py b/PaddleRec/ctr/deepfm_dygraph/data/download_preprocess.py
new file mode 100644
index 00000000..a193e2fc
--- /dev/null
+++ b/PaddleRec/ctr/deepfm_dygraph/data/download_preprocess.py
@@ -0,0 +1,27 @@
+import os
+import shutil
+import sys
+
+LOCAL_PATH = os.path.dirname(os.path.abspath(__file__))
+TOOLS_PATH = os.path.join(LOCAL_PATH, "..", "..", "tools")
+sys.path.append(TOOLS_PATH)
+
+from tools import download_file_and_uncompress, download_file
+
+if __name__ == '__main__':
+    url = "https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz"
+    url2 = "https://paddlerec.bj.bcebos.com/deepfm%2Ffeat_dict_10.pkl2"
+
+    print("download and extract starting...")
+    download_file_and_uncompress(url)
+    if not os.path.exists("aid_data"):
+        os.makedirs("aid_data")
+    download_file(url2, "./aid_data/feat_dict_10.pkl2", True)
+    print("download and extract finished")
+
+    print("preprocessing...")
+    os.system("python preprocess.py")
+    print("preprocess done")
+
+    shutil.rmtree("raw_data")
+    print("done")
diff --git a/PaddleRec/ctr/deepfm_dygraph/data/preprocess.py b/PaddleRec/ctr/deepfm_dygraph/data/preprocess.py
new file mode 100644
index 00000000..b98141ea
--- /dev/null
+++ b/PaddleRec/ctr/deepfm_dygraph/data/preprocess.py
@@ -0,0 +1,120 @@
+from __future__ import division
+import os
+import numpy
+from collections import Counter
+import shutil
+import pickle
+
+
+def get_raw_data(input_file, raw_data, ins_per_file):
+    if not os.path.isdir(raw_data):
+        os.mkdir(raw_data)
+
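+    # Shard the raw input into files part-0, part-1, ... of ins_per_file
+    # lines each, so the later train/test split can work at file granularity.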
+    fin = open(input_file, 'r')
+    fout = open(os.path.join(raw_data, 'part-0'), 'w')
+    for line_idx, line in enumerate(fin):
+        if line_idx % ins_per_file == 0 and line_idx != 0:
+            fout.close()
+            cur_part_idx = int(line_idx / ins_per_file)
+            fout = open(
+                os.path.join(raw_data, 'part-' + str(cur_part_idx)), 'w')
+        fout.write(line)
+    fout.close()
+    fin.close()
+
+
+def split_data(raw_data, aid_data, train_data, test_data):
+    split_rate_ = 0.9
+    dir_train_file_idx_ = os.path.join(aid_data, 'train_file_idx.txt')
+    filelist_ = [
+        os.path.join(raw_data, 'part-%d' % x)
+        for x in range(len(os.listdir(raw_data)))
+    ]
+
+    if not os.path.exists(dir_train_file_idx_):
+        train_file_idx = list(
+            numpy.random.choice(
+                len(filelist_), int(len(filelist_) * split_rate_), False))
+        with open(dir_train_file_idx_, 'w') as fout:
+            fout.write(str(train_file_idx))
+    else:
+        with open(dir_train_file_idx_, 'r') as fin:
+            train_file_idx = eval(fin.read())
+
+    for idx in range(len(filelist_)):
+        if idx in train_file_idx:
+            shutil.move(filelist_[idx], train_data)
+        else:
+            shutil.move(filelist_[idx], test_data)
+
+
+def get_feat_dict(input_file, aid_data, print_freq=100000,
+                  total_ins=45000000):
+    freq_ = 10
+    dir_feat_dict_ = os.path.join(aid_data,
+                                  'feat_dict_' + str(freq_) + '.pkl2')
+    continuous_range_ = range(1, 14)
+    categorical_range_ = range(14, 40)
+
+    if not os.path.exists(dir_feat_dict_):
+        # Count the number of occurrences of discrete features
+        feat_cnt = Counter()
+        with open(input_file, 'r') as fin:
+            for line_idx, line in enumerate(fin):
+                if line_idx % print_freq == 0:
+                    print('generating feature dict {:.2f} %'.format(
+                        (line_idx / total_ins) * 100))
+                features = line.rstrip('\n').split('\t')
+                for idx in categorical_range_:
+                    if features[idx] == '':
+                        continue
+                    feat_cnt.update([features[idx]])
+
+        # Only retain discrete features with high frequency
+        dis_feat_set = set()
+        for feat, cnt in feat_cnt.items():
+            if cnt >= freq_:
+                dis_feat_set.add(feat)
+
+        # Create a dictionary for continuous and discrete features
+        feat_dict = {}
+        tc = 1
+        # Continuous features take indices 1..13; index 0 is reserved for
+        # missing or filtered-out values.
+        for idx in continuous_range_:
+            feat_dict[idx] = tc
+            tc += 1
+        # Frequent discrete feature values follow, starting at index 14.
+        for feat in dis_feat_set:
+            feat_dict[feat] = tc
+            tc += 1
+        # Save dictionary
+        with open(dir_feat_dict_, 'wb') as fout:
+            pickle.dump(feat_dict, fout, protocol=2)
+        print('args.num_feat ', len(feat_dict) + 1)
+
+
+def preprocess(input_file,
+               outdir,
+               ins_per_file,
+               total_ins=None,
+               print_freq=None):
+    train_data = os.path.join(outdir, "train_data")
+    test_data = os.path.join(outdir, "test_data")
+    aid_data = os.path.join(outdir, "aid_data")
+    raw_data = os.path.join(outdir, "raw_data")
+    if not os.path.isdir(train_data):
+        os.mkdir(train_data)
+    if not os.path.isdir(test_data):
+        os.mkdir(test_data)
+    if not os.path.isdir(aid_data):
+        os.mkdir(aid_data)
+
+    if print_freq is None:
+        print_freq = 10 * ins_per_file
+    if total_ins is None:
+        # Fall back to the Criteo dataset size; total_ins only affects the
+        # progress percentage printed while building the feature dict.
+        total_ins = 45000000
+
+    get_raw_data(input_file, raw_data, ins_per_file)
+    split_data(raw_data, aid_data, train_data, test_data)
+    get_feat_dict(input_file, aid_data, print_freq, total_ins)
+
+    print('Done!')
+
+
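+# Example invocation: shard the raw Criteo train.txt in this directory into
+# part files of 200000 lines each; 45000000 is the total instance count,
+# used only for progress reporting while building the feature dictionary.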
+if __name__ == '__main__':
+    preprocess('train.txt', './', 200000, 45000000)
diff --git a/PaddleRec/ctr/deepfm_dygraph/data_reader.py b/PaddleRec/ctr/deepfm_dygraph/data_reader.py
new file mode 100644
index 00000000..7c9d9abc
--- /dev/null
+++ b/PaddleRec/ctr/deepfm_dygraph/data_reader.py
@@ -0,0 +1,75 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import pickle
+import random
+
+import paddle
+
+
+class DataGenerator(object):
+    def __init__(self, feat_dict_path):
+        # min-max of continuous features in Criteo dataset
+        self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+        self.cont_max_ = [
+            5775, 257675, 65535, 969, 23159456, 431037, 56311, 6047, 29019,
+            46, 231, 4008, 7393
+        ]
+        self.cont_diff_ = [
+            self.cont_max_[i] - self.cont_min_[i]
+            for i in range(len(self.cont_min_))
+        ]
+        self.continuous_range_ = range(1, 14)
+        self.categorical_range_ = range(14, 40)
+        self.feat_dict_ = pickle.load(open(feat_dict_path, 'rb'))
+
+    def _process_line(self, line):
+        features = line.rstrip('\n').split('\t')
+        feat_idx = []
+        feat_value = []
+        for idx in self.continuous_range_:
+            if features[idx] == '':
+                feat_idx.append(0)
+                feat_value.append(0.0)
+            else:
+                feat_idx.append(self.feat_dict_[idx])
+                feat_value.append(
+                    (float(features[idx]) - self.cont_min_[idx - 1]) /
+                    self.cont_diff_[idx - 1])
+        for idx in self.categorical_range_:
+            if features[idx] == '' or features[idx] not in self.feat_dict_:
+                feat_idx.append(0)
+                feat_value.append(0.0)
+            else:
+                feat_idx.append(self.feat_dict_[features[idx]])
+                feat_value.append(1.0)
+        label = [int(features[0])]
+        return feat_idx, feat_value, label
+
+    def train_reader(self, file_list, batch_size, cycle, shuffle=True):
+        def _reader():
+            if shuffle:
+                random.shuffle(file_list)
+            while True:
+                for fn in file_list:
+                    for line in open(fn, 'r'):
+                        yield self._process_line(line)
+                if not cycle:
+                    break
+
+        return paddle.batch(_reader, batch_size=batch_size)
+
+
+def data_reader(batch_size,
+                file_list,
+                feat_dict_path,
+                cycle=False,
+                shuffle=False,
+                data_type="train"):
+    generator = DataGenerator(feat_dict_path)
+
+    if data_type != "train" and data_type != "test":
+        print("data_type only supports train | test")
+        raise Exception("data_type only supports train | test")
+    return generator.train_reader(
+        file_list, batch_size, cycle, shuffle=shuffle)
diff --git a/PaddleRec/ctr/deepfm_dygraph/infer.py b/PaddleRec/ctr/deepfm_dygraph/infer.py
new file mode 100644
index 00000000..7feec87a
--- /dev/null
+++ b/PaddleRec/ctr/deepfm_dygraph/infer.py
@@ -0,0 +1,90 @@
+from __future__ import print_function
+
+import os
+
+import numpy as np
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.base import to_variable
+import logging
+import time
+
+import data_reader
+import utility as utils
+from network import DeepFM
+
+logging.basicConfig(
+    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def infer(args):
+    if args.use_gpu:
+        place = fluid.CUDAPlace(0)
+    else:
+        place = fluid.CPUPlace()
+    with fluid.dygraph.guard(place):
+        deepfm = DeepFM(args)
+
+        test_filelist = [
+            os.path.join(args.test_data_dir, x)
+            for x in os.listdir(args.test_data_dir)
+        ]
+
+        test_reader = data_reader.data_reader(
+            args.batch_size, test_filelist, args.feat_dict, data_type="test")
+
+        # load model
+        if args.checkpoint:
+            model_dict, optimizer_dict = fluid.dygraph.load_dygraph(
+                args.checkpoint)
+            deepfm.set_dict(model_dict)
+            logger.info("load model {} finished.".format(args.checkpoint))
+        else:
+            logger.error("no model to load!")
+            logger.error("please set model to load in --checkpoint first.")
+            exit(1)
+
+        def eval():
+            deepfm.eval()
+            logger.info("start eval model.")
+            total_step = 0
+            batch_begin = time.time()
+            auc_metric_test = fluid.metrics.Auc("ROC")
+            for data in test_reader():
+                total_step += 1
+                raw_feat_idx, raw_feat_value, label = zip(*data)
+                raw_feat_idx = np.array(raw_feat_idx, dtype=np.int64)
+                raw_feat_value = np.array(raw_feat_value, dtype=np.float32)
+                label = np.array(label, dtype=np.int64)
+                raw_feat_idx, raw_feat_value, label = [
+                    to_variable(i)
+                    for i in [raw_feat_idx, raw_feat_value, label]
+                ]
+
+                predict = deepfm(raw_feat_idx, raw_feat_value, label)
+
+                # for auc
+                predict_2d = fluid.layers.concat([1 - predict, predict], 1)
+                auc_metric_test.update(
+                    preds=predict_2d.numpy(), labels=label.numpy())
+
+                if total_step > 0 and total_step % 100 == 0:
+                    logger.info(
+                        "TEST --> batch: {} auc: {:.6f} speed: {:.2f} ins/s".
+                        format(total_step,
+                               auc_metric_test.eval(), 100 * args.batch_size /
+                               (time.time() - batch_begin)))
+                    batch_begin = time.time()
+
+            logger.info("test auc is %.6f" % auc_metric_test.eval())
+
+        begin = time.time()
+        eval()
+        logger.info("test finished, cost %f s" % (time.time() - begin))
+
+
+if __name__ == '__main__':
+    args = utils.parse_args()
+    utils.print_arguments(args)
+
+    infer(args)
diff --git a/PaddleRec/ctr/deepfm_dygraph/network.py b/PaddleRec/ctr/deepfm_dygraph/network.py
new file mode 100644
index 00000000..79d24f06
--- /dev/null
+++ b/PaddleRec/ctr/deepfm_dygraph/network.py
@@ -0,0 +1,125 @@
+import math
+
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.nn import FC, Embedding
+
+
+class DeepFM(fluid.dygraph.Layer):
+    def __init__(self, args):
+        super(DeepFM, self).__init__()
+        self.args = args
+        self.init_value_ = 0.1
+
+        self.fm = FM(args)
+        self.dnn = DNN(args)
+
+    def forward(self, raw_feat_idx, raw_feat_value, label):
+        feat_idx = fluid.layers.reshape(raw_feat_idx,
+                                        [-1, 1])  # (None * num_field) * 1
+        feat_value = fluid.layers.reshape(
+            raw_feat_value,
+            [-1, self.args.num_field, 1])  # None * num_field * 1
+
+        y_first_order, y_second_order, feat_embeddings = self.fm(feat_idx,
+                                                                 feat_value)
+        y_dnn = self.dnn(feat_embeddings)
+
+        predict = fluid.layers.sigmoid(y_first_order + y_second_order + y_dnn)
+
+        return predict
+
+
+class FM(fluid.dygraph.Layer):
+    def __init__(self, args):
+        super(FM, self).__init__()
+        self.args = args
+        self.init_value_ = 0.1
+        self.embedding_w = Embedding(
+            size=[self.args.num_feat + 1, 1],
+            dtype='float32',
+            padding_idx=0,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.TruncatedNormalInitializer(
+                    loc=0.0, scale=self.init_value_),
+                regularizer=fluid.regularizer.L1DecayRegularizer(
+                    self.args.reg)))
+        self.embedding = Embedding(
+            size=[self.args.num_feat + 1, self.args.embedding_size],
+            dtype='float32',
+            padding_idx=0,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.TruncatedNormalInitializer(
+                    loc=0.0,
+                    scale=self.init_value_ /
+                    math.sqrt(float(self.args.embedding_size)))))
+
+    def forward(self, feat_idx, feat_value):
+        # -------------------- first order term --------------------
+        first_weights_re = self.embedding_w(feat_idx)
+        first_weights = fluid.layers.reshape(
+            first_weights_re,
+            shape=[-1, self.args.num_field, 1])  # None * num_field * 1
+        y_first_order = fluid.layers.reduce_sum(first_weights * feat_value, 1)
+
+        # -------------------- second order term --------------------
+        feat_embeddings_re = self.embedding(feat_idx)
+        feat_embeddings = fluid.layers.reshape(
+            feat_embeddings_re,
+            shape=[-1, self.args.num_field, self.args.embedding_size
+                   ])  # None * num_field * embedding_size
+        feat_embeddings = feat_embeddings * feat_value  # None * num_field * embedding_size
+
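+        # The second-order interactions use the FM identity, evaluated per
+        # embedding dimension and then summed, which avoids enumerating all
+        # O(num_field^2) feature pairs:
+        #   sum_{i<j} <v_i x_i, v_j x_j>
+        #       = 0.5 * ((sum_i v_i x_i)^2 - sum_i (v_i x_i)^2)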
+        # sum_square part
+        summed_features_emb = fluid.layers.reduce_sum(
+            feat_embeddings, 1)  # None * embedding_size
+        summed_features_emb_square = fluid.layers.square(
+            summed_features_emb)  # None * embedding_size
+
+        # square_sum part
+        squared_features_emb = fluid.layers.square(
+            feat_embeddings)  # None * num_field * embedding_size
+        squared_sum_features_emb = fluid.layers.reduce_sum(
+            squared_features_emb, 1)  # None * embedding_size
+
+        y_second_order = 0.5 * fluid.layers.reduce_sum(
+            summed_features_emb_square - squared_sum_features_emb,
+            1,
+            keep_dim=True)  # None * 1
+
+        return y_first_order, y_second_order, feat_embeddings
+
+
+class DNN(fluid.dygraph.Layer):
+    def __init__(self, args):
+        super(DNN, self).__init__()
+        self.args = args
+        self.init_value_ = 0.1
+        sizes = self.args.layer_sizes + [1]
+        acts = [self.args.act
+                for _ in range(len(self.args.layer_sizes))] + [None]
+        # NOTE: the hard-coded 10 below presumably matches the default
+        # embedding_size.
+        w_scales = [
+            self.init_value_ / math.sqrt(float(10))
+            for _ in range(len(self.args.layer_sizes))
+        ] + [self.init_value_]
+        self.fcs = []
+        for i in range(len(self.args.layer_sizes) + 1):
+            fc = FC(
+                self.full_name(),
+                size=sizes[i],
+                act=acts[i],
+                param_attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.TruncatedNormalInitializer(
+                        loc=0.0, scale=w_scales[i])),
+                bias_attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.TruncatedNormalInitializer(
+                        loc=0.0, scale=self.init_value_)))
+            self.add_sublayer('fc_%d' % i, fc)
+            self.fcs.append(fc)
+
+    def forward(self, feat_embeddings):
+        y_dnn = fluid.layers.reshape(
+            feat_embeddings,
+            [-1, self.args.num_field * self.args.embedding_size])
+        for fc in self.fcs:
+            y_dnn = fc(y_dnn)
+        return y_dnn
diff --git a/PaddleRec/ctr/deepfm_dygraph/train.py b/PaddleRec/ctr/deepfm_dygraph/train.py
new file mode 100644
index 00000000..19774e97
--- /dev/null
+++ b/PaddleRec/ctr/deepfm_dygraph/train.py
@@ -0,0 +1,155 @@
+from __future__ import print_function
+
+import os
+
+import numpy as np
+import paddle.fluid as fluid
+import time
+from paddle.fluid.dygraph.base import to_variable
+import logging
+
+import data_reader
+import utility as utils
+from network import DeepFM
+
+logging.basicConfig(
+    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def train(args):
+    if args.use_gpu:
+        place = fluid.CUDAPlace(0)
+    else:
+        place = fluid.CPUPlace()
+    with fluid.dygraph.guard(place):
+        deepfm = DeepFM(args)
+
+        train_filelist = [
+            os.path.join(args.train_data_dir, x)
+            for x in os.listdir(args.train_data_dir)
+        ]
+        test_filelist = [
+            os.path.join(args.test_data_dir, x)
+            for x in os.listdir(args.test_data_dir)
+        ]
+
+        train_reader = data_reader.data_reader(
+            args.batch_size, train_filelist, args.feat_dict,
+            data_type="train")
+        test_reader = data_reader.data_reader(
+            args.batch_size, test_filelist, args.feat_dict, data_type="test")
+
+        def eval(epoch):
+            deepfm.eval()
+            logger.info("start eval model.")
+            total_step = 0.0
+            auc_metric_test = fluid.metrics.Auc("ROC")
+            for data in test_reader():
+                total_step += 1
+                raw_feat_idx, raw_feat_value, label = zip(*data)
+                raw_feat_idx = np.array(raw_feat_idx, dtype=np.int64)
+                raw_feat_value = np.array(raw_feat_value, dtype=np.float32)
+                label = np.array(label, dtype=np.int64)
+                raw_feat_idx, raw_feat_value, label = [
+                    to_variable(i)
+                    for i in [raw_feat_idx, raw_feat_value, label]
+                ]
+
+                predict = deepfm(raw_feat_idx, raw_feat_value, label)
+
+                # for auc
+                predict_2d = fluid.layers.concat([1 - predict, predict], 1)
+                auc_metric_test.update(
+                    preds=predict_2d.numpy(), labels=label.numpy())
+
+            logger.info("test auc of epoch %d is %.6f" %
+                        (epoch, auc_metric_test.eval()))
+
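+        # Note: both this L2 penalty and the L1 penalty on the FM
+        # first-order weights (see network.py) use args.reg as coefficient.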
+        optimizer = fluid.optimizer.Adam(
+            learning_rate=args.lr,
+            parameter_list=deepfm.parameters(),
+            regularization=fluid.regularizer.L2DecayRegularizer(args.reg))
+
+        # load model if exists
+        start_epoch = 0
+        if args.checkpoint:
+            model_dict, optimizer_dict = fluid.dygraph.load_dygraph(
+                args.checkpoint)
+            deepfm.set_dict(model_dict)
+            optimizer.set_dict(optimizer_dict)
+            start_epoch = int(
+                os.path.basename(args.checkpoint).split("_")[
+                    -1]) + 1  # get next train epoch
+            logger.info("load model {} finished.".format(args.checkpoint))
+
+        for epoch in range(start_epoch, args.num_epoch):
+            begin = time.time()
+            batch_begin = time.time()
+            batch_id = 0
+            total_loss = 0.0
+            auc_metric = fluid.metrics.Auc("ROC")
+            logger.info("training epoch {} start.".format(epoch))
+
+            for data in train_reader():
+                raw_feat_idx, raw_feat_value, label = zip(*data)
+                raw_feat_idx = np.array(raw_feat_idx, dtype=np.int64)
+                raw_feat_value = np.array(raw_feat_value, dtype=np.float32)
+                label = np.array(label, dtype=np.int64)
+                raw_feat_idx, raw_feat_value, label = [
+                    to_variable(i)
+                    for i in [raw_feat_idx, raw_feat_value, label]
+                ]
+
+                predict = deepfm(raw_feat_idx, raw_feat_value, label)
+
+                loss = fluid.layers.log_loss(
+                    input=predict,
+                    label=fluid.layers.cast(label, dtype="float32"))
+                batch_loss = fluid.layers.reduce_sum(loss)
+
+                total_loss += batch_loss.numpy().item()
+
+                batch_loss.backward()
+                optimizer.minimize(batch_loss)
+                deepfm.clear_gradients()
+
+                # for auc
+                predict_2d = fluid.layers.concat([1 - predict, predict], 1)
+                auc_metric.update(
+                    preds=predict_2d.numpy(), labels=label.numpy())
+
+                if batch_id > 0 and batch_id % 100 == 0:
+                    logger.info(
+                        "epoch: {}, batch_id: {}, loss: {:.6f}, auc: {:.6f}, speed: {:.2f} ins/s".
+                        format(epoch, batch_id,
+                               total_loss / args.batch_size / 100,
+                               auc_metric.eval(), 100 * args.batch_size /
+                               (time.time() - batch_begin)))
+                    batch_begin = time.time()
+                    total_loss = 0.0
+
+                batch_id += 1
+            logger.info("epoch %d is finished and takes %f s" %
+                        (epoch, time.time() - begin))
+            # save model and optimizer
+            logger.info("going to save epoch {} model and optimizer.".format(
+                epoch))
+            fluid.dygraph.save_dygraph(
+                deepfm.state_dict(),
+                model_path=os.path.join(args.model_output_dir,
+                                        "epoch_" + str(epoch)))
+            fluid.dygraph.save_dygraph(
+                optimizer.state_dict(),
+                model_path=os.path.join(args.model_output_dir,
+                                        "epoch_" + str(epoch)))
+            logger.info("save epoch {} finished.".format(epoch))
+            # eval model
+            deepfm.eval()
+            eval(epoch)
+            deepfm.train()
+
+
+if __name__ == '__main__':
+    args = utils.parse_args()
+    utils.print_arguments(args)
+
+    train(args)
diff --git a/PaddleRec/ctr/deepfm_dygraph/utility.py b/PaddleRec/ctr/deepfm_dygraph/utility.py
new file mode 100644
index 00000000..31a80bbb
--- /dev/null
+++ b/PaddleRec/ctr/deepfm_dygraph/utility.py
@@ -0,0 +1,98 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import distutils.util
+
+import numpy as np
+import six
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="deepfm dygraph")
+    parser.add_argument(
+        '--train_data_dir',
+        type=str,
+        default='data/train_data',
+        help='The path of train data (default: data/train_data)')
+    parser.add_argument(
+        '--test_data_dir',
+        type=str,
+        default='data/test_data',
+        help='The path of test data (default: data/test_data)')
+    parser.add_argument(
+        '--model_output_dir',
+        type=str,
+        default='models',
+        help='The path for model to store (default: models)')
+    parser.add_argument(
+        '--checkpoint',
+        type=str,
+        default='',
+        help='The path for model and optimizer to load (default: "")')
+    parser.add_argument(
+        '--feat_dict',
+        type=str,
+        default='data/aid_data/feat_dict_10.pkl2',
+        help='The path of the feature dictionary')
+    parser.add_argument(
+        '--num_epoch',
+        type=int,
+        default=10,
+        help="The number of epochs to train (default: 10)")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=4096,
+        help="The size of mini-batch (default: 4096)")
+    parser.add_argument(
+        '--use_gpu', type=distutils.util.strtobool, default=True)
+
+    parser.add_argument(
+        '--embedding_size',
+        type=int,
+        default=10,
+        help="The size for embedding layer (default: 10)")
+    parser.add_argument(
+        '--layer_sizes',
+        nargs='+',
+        type=int,
+        default=[400, 400, 400],
+        help='The sizes of the hidden layers (default: [400, 400, 400])')
+    parser.add_argument(
+        '--act',
+        type=str,
+        default='relu',
+        help='The activation of each hidden layer (default: relu)')
+    parser.add_argument(
+        '--lr', type=float, default=1e-3, help='Learning rate (default: 1e-3)')
+    parser.add_argument(
+        '--reg',
+        type=float,
+        default=1e-4,
+        help='Regularization coefficient (default: 1e-4)')
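+    # num_field: 13 continuous + 26 categorical feature fields of Criteo.
+    # num_feat must match the dictionary size reported by data/preprocess.py
+    # (len(feat_dict) + 1; index 0 is reserved for missing/filtered values).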
+    parser.add_argument('--num_field', type=int, default=39)
+    parser.add_argument('--num_feat', type=int, default=1086460)  # 2090493
+
+    return parser.parse_args()
+
+
+def print_arguments(args):
+    """Print argparse's arguments.
+    Usage:
+    .. code-block:: python
+        parser = argparse.ArgumentParser()
+        parser.add_argument("name", default="John", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+    :param args: Input argparse.Namespace for printing.
+    :type args: argparse.Namespace
+    """
+    print("----------- Configuration Arguments -----------")
+    for arg, value in sorted(six.iteritems(vars(args))):
+        print("%s: %s" % (arg, value))
+    print("------------------------------------------------")
+
+
+def to_numpy(data):
+    # Flatten a batch of per-instance arrays into a single int64 column
+    # vector of shape [N, 1]; not referenced elsewhere in this example.
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    return flattened_data
-- 
GitLab