Commit 425b3399 authored by CandyCaneLane

	new file:   xdeepfm/README.md
	new file:   xdeepfm/args.py
	new file:   xdeepfm/criteo_reader.py
	new file:   xdeepfm/data/test_data/ev
	new file:   xdeepfm/data/train_data/tr
	new file:   xdeepfm/infer.py
	new file:   xdeepfm/local_train.py
	new file:   xdeepfm/network_conf.py

Parent 1ba79a58

xdeepfm/README.md
# xDeepFM for CTR Prediction
## Introduction
A PaddlePaddle reproduction, on demo data, of the paper "xDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems", based on [the open-source implementation](https://github.com/Leavingseason/xDeepFM).
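The model sums a linear term, the CIN output, and the DNN output, then applies a sigmoid, as implemented in `network_conf.py` below:

$$\hat{y} = \sigma\left(y_{\text{linear}} + y_{\text{CIN}} + y_{\text{DNN}}\right)$$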
## Environment
- PaddlePaddle 1.5
## Train
```bash
python local_train.py --model_output_dir models
```
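Any hyper-parameter defined in `args.py` can be overridden on the command line, for example a CPU run with different layer sizes (the values here are illustrative):

```bash
python local_train.py \
    --model_output_dir models \
    --use_gpu 0 \
    --num_epoch 5 \
    --layer_sizes_cin 20 10 \
    --layer_sizes_dnn 400 400 400
```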
## Infer
```bash
python infer.py --model_output_dir models --test_epoch 1
```
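Inference likewise honors `--use_gpu`; for example, evaluating the model saved after epoch 10 on CPU:

```bash
python infer.py --model_output_dir models --test_epoch 10 --use_gpu 0
```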
Note: `--test_epoch N` loads the model saved under `models/epoch_N`. The last line of the log reports the total Logloss and AUC over all test data.
## Result
After 10 training epochs, the test Logloss is `0.48657` and the test AUC is `0.7308`.
xdeepfm/args.py

import argparse


def parse_args():
    parser = argparse.ArgumentParser(description="PaddlePaddle CTR example")
    parser.add_argument(
        '--train_data_dir',
        type=str,
        default='data/train_data',
        help='The path of train data (default: data/train_data)')
    parser.add_argument(
        '--test_data_dir',
        type=str,
        default='data/test_data',
        help='The path of test data (default: data/test_data)')
    parser.add_argument(
        '--batch_size',
        type=int,
        default=100,
        help='The size of mini-batch (default: 100)')
    parser.add_argument(
        '--embedding_size',
        type=int,
        default=10,
        help='The size of the embedding layer (default: 10)')
    parser.add_argument(
        '--num_epoch',
        type=int,
        default=10,
        help='The number of epochs to train (default: 10)')
    parser.add_argument(
        '--model_output_dir',
        type=str,
        required=True,
        help='The path where models are stored')
    parser.add_argument(
        '--num_thread',
        type=int,
        default=1,
        help='The number of threads (default: 1)')
    parser.add_argument(
        '--test_epoch',
        type=str,
        default='1',
        help='The epoch of the saved model to evaluate (default: 1)')
    parser.add_argument(
        '--layer_sizes_dnn',
        nargs='+',
        type=int,
        default=[10, 10, 10],
        help='The size of each DNN layer (default: 10 10 10)')
    parser.add_argument(
        '--layer_sizes_cin',
        nargs='+',
        type=int,
        default=[10, 10],
        help='The size of each CIN layer (default: 10 10)')
    parser.add_argument(
        '--act',
        type=str,
        default='relu',
        help='The activation of each DNN layer (default: relu)')
    parser.add_argument(
        '--lr', type=float, default=1e-1, help='Learning rate (default: 1e-1)')
    parser.add_argument(
        '--reg',
        type=float,
        default=1e-4,
        help='L2 regularization coefficient (default: 1e-4)')
    parser.add_argument(
        '--num_field',
        type=int,
        default=39,
        help='The number of feature fields per instance (default: 39)')
    parser.add_argument(
        '--num_feat',
        type=int,
        default=28651,
        help='The size of the feature vocabulary (default: 28651)')
    parser.add_argument(
        '--model_name',
        type=str,
        default='ctr_xdeepfm_model',
        help='The name of model (default: ctr_xdeepfm_model)')
    parser.add_argument(
        '--use_gpu',
        type=int,
        default=1,
        help='Whether to run on GPU (1) or CPU (0) (default: 1)')
    return parser.parse_args()
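`--layer_sizes_dnn` and `--layer_sizes_cin` use `nargs='+'`, so each takes a space-separated list of integers. A quick, self-contained sanity check of the parser (the simulated command line below is illustrative):

```python
import sys
from args import parse_args

# Simulate a command line; --model_output_dir is the only required flag.
sys.argv = ['local_train.py', '--model_output_dir', 'models',
            '--layer_sizes_cin', '20', '10']
args = parse_args()
print(args.layer_sizes_cin)  # [20, 10]
print(args.lr)               # 0.1 (the default)
```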
xdeepfm/criteo_reader.py

import paddle.fluid.incubate.data_generator as dg


class CriteoDataset(dg.MultiSlotDataGenerator):
    def _process_line(self, line):
        # Each line: label, then 39 tab-separated feature ids.
        features = line.strip('\n').split('\t')
        feat_idx = []
        feat_value = []
        for idx in range(1, 40):
            feat_idx.append(int(features[idx]))
            feat_value.append(1.0)  # all features are one-hot, so the value is 1.0
        label = [int(features[0])]
        return feat_idx, feat_value, label

    def test(self, filelist):
        def local_iter():
            for fname in filelist:
                with open(fname.strip(), 'r') as fin:
                    for line in fin:
                        feat_idx, feat_value, label = self._process_line(line)
                        yield [feat_idx, feat_value, label]

        return local_iter

    def generate_sample(self, line):
        def data_iter():
            feat_idx, feat_value, label = self._process_line(line)
            yield [('feat_idx', feat_idx), ('feat_value', feat_value),
                   ('label', label)]

        return data_iter


if __name__ == '__main__':
    criteo_dataset = CriteoDataset()
    criteo_dataset.run_from_stdin()
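A minimal sketch of the line format `_process_line` expects (label, then 39 tab-separated feature ids); the ids below are made up for illustration:

```python
from criteo_reader import CriteoDataset

# Label followed by 39 feature ids; only the first four are meaningful
# here, the rest are padded with id 0 (the embedding's padding_idx).
sample = '\t'.join(['1', '5', '17', '312', '28650'] + ['0'] * 35)

reader = CriteoDataset()
feat_idx, feat_value, label = reader._process_line(sample)
print(feat_idx[:4])    # [5, 17, 312, 28650]
print(feat_value[:4])  # [1.0, 1.0, 1.0, 1.0]
print(label)           # [1]
```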
xdeepfm/data/test_data/ev (diff collapsed)

xdeepfm/data/train_data/tr (diff collapsed)
xdeepfm/infer.py

import logging
import os

import numpy as np
import paddle
import paddle.fluid as fluid

from args import parse_args
from criteo_reader import CriteoDataset
import network_conf

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger('fluid')
logger.setLevel(logging.INFO)


def infer():
    args = parse_args()

    if args.use_gpu == 1:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    inference_scope = fluid.Scope()

    test_files = [
        args.test_data_dir + '/' + x for x in os.listdir(args.test_data_dir)
    ]
    criteo_dataset = CriteoDataset()
    test_reader = paddle.batch(
        criteo_dataset.test(test_files), batch_size=args.batch_size)

    startup_program = fluid.framework.Program()
    test_program = fluid.framework.Program()
    cur_model_path = args.model_output_dir + '/epoch_' + args.test_epoch

    with fluid.scope_guard(inference_scope):
        with fluid.framework.program_guard(test_program, startup_program):
            loss, auc, data_list = eval('network_conf.' + args.model_name)(
                args.embedding_size, args.num_field, args.num_feat,
                args.layer_sizes_dnn, args.act, args.reg, args.layer_sizes_cin)

            exe = fluid.Executor(place)
            feeder = fluid.DataFeeder(feed_list=data_list, place=place)

            fluid.io.load_persistables(
                executor=exe,
                dirname=cur_model_path,
                main_program=fluid.default_main_program())

            # Zero out the accumulated AUC states loaded from the checkpoint
            # so that the reported AUC covers only the test data.
            auc_states_names = ['_generated_var_2', '_generated_var_3']
            for name in auc_states_names:
                param = inference_scope.var(name).get_tensor()
                param_array = np.zeros(param._get_dims()).astype("int64")
                param.set(param_array, place)

            loss_all = 0
            num_ins = 0
            for batch_id, data_test in enumerate(test_reader()):
                loss_val, auc_val = exe.run(test_program,
                                            feed=feeder.feed(data_test),
                                            fetch_list=[loss.name, auc.name])
                num_ins += len(data_test)
                loss_all += loss_val * len(data_test)
                logger.info('TEST --> batch: {} loss: {} auc_val: {}'.format(
                    batch_id, loss_all / num_ins, auc_val))

            print(
                'The last log info is the total Logloss and AUC for all test data.'
            )


if __name__ == '__main__':
    infer()
xdeepfm/local_train.py

import os
import sys
import time

import paddle.fluid as fluid

from args import parse_args
import network_conf


def train():
    args = parse_args()

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    loss, auc, data_list = eval('network_conf.' + args.model_name)(
        args.embedding_size, args.num_field, args.num_feat,
        args.layer_sizes_dnn, args.act, args.reg, args.layer_sizes_cin)
    optimizer = fluid.optimizer.SGD(
        learning_rate=args.lr,
        regularization=fluid.regularizer.L2DecayRegularizer(args.reg))
    optimizer.minimize(loss)

    # The dataset pipes each training file through criteo_reader.py, which
    # converts raw lines into the three feed slots declared in data_list.
    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(data_list)
    dataset.set_pipe_command('python criteo_reader.py')
    dataset.set_batch_size(args.batch_size)
    dataset.set_filelist([
        args.train_data_dir + '/' + x for x in os.listdir(args.train_data_dir)
    ])

    if args.use_gpu == 1:
        exe = fluid.Executor(fluid.CUDAPlace(0))
        dataset.set_thread(1)
    else:
        exe = fluid.Executor(fluid.CPUPlace())
        dataset.set_thread(args.num_thread)
    exe.run(fluid.default_startup_program())

    for epoch_id in range(args.num_epoch):
        start = time.time()
        exe.train_from_dataset(
            program=fluid.default_main_program(),
            dataset=dataset,
            fetch_list=[loss, auc],
            fetch_info=['loss', 'auc'],
            debug=False,
            print_period=1000)
        model_dir = args.model_output_dir + '/epoch_' + str(epoch_id + 1)
        sys.stderr.write('epoch%d is finished and takes %f s\n' % (
            (epoch_id + 1), time.time() - start))
        fluid.io.save_persistables(
            executor=exe,
            dirname=model_dir,
            main_program=fluid.default_main_program())


if __name__ == '__main__':
    train()
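Since `set_pipe_command('python criteo_reader.py')` runs the reader as a stdin-to-stdout filter over every file in the filelist, its output can be inspected directly from the shell (the exact text encoding of the emitted slots is an internal detail of `MultiSlotDataGenerator`):

```bash
head -n 1 data/train_data/tr | python criteo_reader.py
```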
xdeepfm/network_conf.py

import paddle.fluid as fluid


def ctr_xdeepfm_model(embedding_size, num_field, num_feat, layer_sizes_dnn,
                      act, reg, layer_sizes_cin):
    init_value_ = 0.1
    initer = fluid.initializer.TruncatedNormalInitializer(
        loc=0.0, scale=init_value_)

    raw_feat_idx = fluid.layers.data(
        name='feat_idx', shape=[num_field], dtype='int64')
    raw_feat_value = fluid.layers.data(
        name='feat_value', shape=[num_field], dtype='float32')
    label = fluid.layers.data(
        name='label', shape=[1], dtype='float32')  # None * 1

    feat_idx = fluid.layers.reshape(
        raw_feat_idx, [-1, num_field, 1])  # None * num_field * 1
    feat_value = fluid.layers.reshape(
        raw_feat_value, [-1, num_field, 1])  # None * num_field * 1

    feat_embeddings = fluid.layers.embedding(
        input=feat_idx,
        dtype='float32',
        size=[num_feat + 1, embedding_size],
        padding_idx=0,
        param_attr=fluid.ParamAttr(
            initializer=initer))  # None * num_field * embedding_size
    feat_embeddings = feat_embeddings * feat_value  # None * num_field * embedding_size

    # -------------------- linear --------------------
    weights_linear = fluid.layers.embedding(
        input=feat_idx,
        dtype='float32',
        size=[num_feat + 1, 1],
        padding_idx=0,
        param_attr=fluid.ParamAttr(initializer=initer))  # None * num_field * 1
    b_linear = fluid.layers.create_parameter(
        shape=[1],
        dtype='float32',
        default_initializer=fluid.initializer.ConstantInitializer(value=0))
    y_linear = fluid.layers.reduce_sum(
        (weights_linear * feat_value), 1) + b_linear

    # -------------------- CIN --------------------
    Xs = [feat_embeddings]
    last_s = num_field
    for s in layer_sizes_cin:
        # calculate Z^(k+1) from X^k and X^0
        X_0 = fluid.layers.reshape(
            fluid.layers.transpose(Xs[0], [0, 2, 1]),
            [-1, embedding_size, num_field,
             1])  # None, embedding_size, num_field, 1
        X_k = fluid.layers.reshape(
            fluid.layers.transpose(Xs[-1], [0, 2, 1]),
            [-1, embedding_size, 1, last_s])  # None, embedding_size, 1, last_s
        Z_k_1 = fluid.layers.matmul(
            X_0, X_k)  # None, embedding_size, num_field, last_s

        # compress Z^(k+1) into X^(k+1) with a 1x1 convolution
        Z_k_1 = fluid.layers.reshape(Z_k_1, [
            -1, embedding_size, last_s * num_field
        ])  # None, embedding_size, last_s*num_field
        Z_k_1 = fluid.layers.transpose(
            Z_k_1, [0, 2, 1])  # None, last_s*num_field, embedding_size
        Z_k_1 = fluid.layers.reshape(
            Z_k_1, [-1, last_s * num_field, 1, embedding_size]
        )  # None, last_s*num_field, 1, embedding_size (None, channels_in, h, w)
        X_k_1 = fluid.layers.conv2d(
            Z_k_1,
            num_filters=s,
            filter_size=(1, 1),
            act=None,
            bias_attr=False,
            param_attr=fluid.ParamAttr(
                initializer=initer))  # None, s, 1, embedding_size
        X_k_1 = fluid.layers.reshape(
            X_k_1, [-1, s, embedding_size])  # None, s, embedding_size
        Xs.append(X_k_1)
        last_s = s

    # sum pooling over the embedding dimension of every CIN layer output
    y_cin = fluid.layers.concat(
        Xs[1:], 1)  # None, sum(layer_sizes_cin), embedding_size
    y_cin = fluid.layers.reduce_sum(y_cin, -1)  # None, sum(layer_sizes_cin)
    y_cin = fluid.layers.fc(input=y_cin,
                            size=1,
                            act=None,
                            param_attr=fluid.ParamAttr(initializer=initer),
                            bias_attr=None)
    y_cin = fluid.layers.reduce_sum(y_cin, dim=-1, keep_dim=True)

    # -------------------- DNN --------------------
    y_dnn = fluid.layers.reshape(feat_embeddings,
                                 [-1, num_field * embedding_size])
    for s in layer_sizes_dnn:
        y_dnn = fluid.layers.fc(input=y_dnn,
                                size=s,
                                act=act,
                                param_attr=fluid.ParamAttr(initializer=initer),
                                bias_attr=None)
    y_dnn = fluid.layers.fc(input=y_dnn,
                            size=1,
                            act=None,
                            param_attr=fluid.ParamAttr(initializer=initer),
                            bias_attr=None)

    # ------------------- xDeepFM ------------------
    predict = fluid.layers.sigmoid(y_linear + y_cin + y_dnn)
    cost = fluid.layers.log_loss(input=predict, label=label, epsilon=0.0000001)
    batch_cost = fluid.layers.reduce_mean(cost)

    # for auc
    predict_2d = fluid.layers.concat([1 - predict, predict], 1)
    label_int = fluid.layers.cast(label, 'int64')
    auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict_2d,
                                                          label=label_int,
                                                          slide_steps=0)

    return batch_cost, auc_var, [raw_feat_idx, raw_feat_value, label]
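For intuition, here is a minimal NumPy sketch of one CIN step as wired above: an outer product between X^0 and X^k taken per embedding column, then a 1x1 convolution that is simply a weighted sum over the `num_field * last_s` feature maps. All shapes and the random data are illustrative only:

```python
import numpy as np

batch, num_field, last_s, emb, s_out = 2, 39, 10, 10, 10

X0 = np.random.randn(batch, num_field, emb)  # X^0: raw feature embeddings
Xk = np.random.randn(batch, last_s, emb)     # X^k: previous CIN layer output

# Z^(k+1): pairwise products of X^0 and X^k rows, per embedding column.
# This mirrors the transpose/reshape/matmul sequence in the code above.
Z = np.einsum('bfe,bse->befs', X0, Xk)       # batch, emb, num_field, last_s

# The 1x1 conv2d compresses the num_field*last_s maps into s_out maps:
# each output map is a learned weighted sum of all input maps.
W = np.random.randn(s_out, num_field * last_s)
Z_flat = Z.reshape(batch, emb, num_field * last_s).transpose(0, 2, 1)
Xk1 = np.einsum('om,bme->boe', W, Z_flat)    # batch, s_out, emb

print(Xk1.shape)  # (2, 10, 10)
```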