提交 48a76c01 编写于 作者: Q Qiao Longfei

Update the reader to add a feature-extend mode (pairwise cross-features over the categorical fields).

上级 ad7ba363
......@@ -3,13 +3,17 @@ import math
dense_feature_dim = 13
def ctr_dnn_model(embedding_size, sparse_feature_dim):
def ctr_dnn_model(embedding_size, sparse_feature_dim, extend_id_range=False):
dense_input = fluid.layers.data(
name="dense_input", shape=[dense_feature_dim], dtype='float32')
sparse_feature_num = 26
if extend_id_range:
sparse_feature_num = 26 + 26 * 25
sparse_input_ids = [
fluid.layers.data(
name="C" + str(i), shape=[1], lod_level=1, dtype='int64')
for i in range(1, 27)
for i in range(0, sparse_feature_num)
]
def embedding_layer(input):
......@@ -18,7 +22,7 @@ def ctr_dnn_model(embedding_size, sparse_feature_dim):
is_sparse=True,
# you need to patch https://github.com/PaddlePaddle/Paddle/pull/14190
# if you want to set is_distributed to True
is_distributed=False,
is_distributed=True,
size=[sparse_feature_dim, embedding_size],
param_attr=fluid.ParamAttr(name="SparseFeatFactors", initializer=fluid.initializer.Uniform()))
......
......@@ -2,12 +2,15 @@ class Dataset:
def __init__(self):
pass
class CriteoDataset(Dataset):
def __init__(self, sparse_feature_dim):
def __init__(self, sparse_feature_dim, fix_id_range=True, extend_id_range=False):
self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
self.cont_max_ = [20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
self.cont_diff_ = [20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
self.hash_dim_ = sparse_feature_dim
self.fix_id_range_ = fix_id_range
self.extend_id_range_ = extend_id_range
# here, training data are lines with line_index < train_idx_
self.train_idx_ = 41256555
self.continuous_range_ = range(1, 14)
......@@ -35,7 +38,19 @@ class CriteoDataset(Dataset):
else:
dense_feature.append((float(features[idx]) - self.cont_min_[idx - 1]) / self.cont_diff_[idx - 1])
for idx in self.categorical_range_:
sparse_feature.append([hash("%d_%s" % (idx, features[idx])) % self.hash_dim_])
feature_id = hash("%d_%s" % (idx, features[idx]))
if self.fix_id_range_:
feature_id = feature_id % self.hash_dim_
sparse_feature.append([feature_id])
if self.extend_id_range_:
for i in range(len(self.categorical_range_)):
for j in range(i + 1, len(self.categorical_range_)):
idx1 = self.categorical_range_[i]
idx2 = self.categorical_range_[j]
feature_id = hash("%d_%s_%d_%s" % (idx1, features[idx1], idx2, features[idx2]))
if self.fix_id_range_:
feature_id = feature_id % self.hash_dim_
sparse_feature.append([feature_id])
label = [int(features[0])]
yield [dense_feature] + sparse_feature + [label]
......
......@@ -148,7 +148,7 @@ def train():
if not os.path.isdir(args.model_output_dir):
os.mkdir(args.model_output_dir)
loss, data_list, auc_var, batch_auc_var = ctr_dnn_model(args.embedding_size, args.sparse_feature_dim)
loss, data_list, auc_var, batch_auc_var = ctr_dnn_model(args.embedding_size, args.sparse_feature_dim, False)
optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
optimizer.minimize(loss)
if args.cloud_train:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册