From 48a76c01803e09a58e171d03bfd46826dc679cbe Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Sat, 1 Dec 2018 12:51:16 +0800
Subject: [PATCH] update reader to add feature extend mode

---
 fluid/PaddleRec/ctr/network_conf.py | 10 +++++++---
 fluid/PaddleRec/ctr/reader.py       | 19 +++++++++++++++++--
 fluid/PaddleRec/ctr/train.py        |  2 +-
 3 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/fluid/PaddleRec/ctr/network_conf.py b/fluid/PaddleRec/ctr/network_conf.py
index 4593c16e..bde454f6 100644
--- a/fluid/PaddleRec/ctr/network_conf.py
+++ b/fluid/PaddleRec/ctr/network_conf.py
@@ -3,13 +3,17 @@ import math
 
 dense_feature_dim = 13
 
-def ctr_dnn_model(embedding_size, sparse_feature_dim):
+
+def ctr_dnn_model(embedding_size, sparse_feature_dim, extend_id_range=False):
     dense_input = fluid.layers.data(
         name="dense_input", shape=[dense_feature_dim], dtype='float32')
+    sparse_feature_num = 26
+    if extend_id_range:
+        sparse_feature_num = 26 + 26 * 25
     sparse_input_ids = [
         fluid.layers.data(
             name="C" + str(i), shape=[1], lod_level=1, dtype='int64')
-        for i in range(1, 27)
+        for i in range(0, sparse_feature_num)
     ]
 
     def embedding_layer(input):
@@ -18,7 +22,7 @@ def ctr_dnn_model(embedding_size, sparse_feature_dim):
             is_sparse=True,
             # you need to patch https://github.com/PaddlePaddle/Paddle/pull/14190
             # if you want to set is_distributed to True
-            is_distributed=False,
+            is_distributed=True,
             size=[sparse_feature_dim, embedding_size],
             param_attr=fluid.ParamAttr(name="SparseFeatFactors",
                                        initializer=fluid.initializer.Uniform()))
diff --git a/fluid/PaddleRec/ctr/reader.py b/fluid/PaddleRec/ctr/reader.py
index 851839c3..3efa021d 100644
--- a/fluid/PaddleRec/ctr/reader.py
+++ b/fluid/PaddleRec/ctr/reader.py
@@ -2,12 +2,15 @@ class Dataset:
     def __init__(self):
         pass
 
+
 class CriteoDataset(Dataset):
-    def __init__(self, sparse_feature_dim):
+    def __init__(self, sparse_feature_dim, fix_id_range=True, extend_id_range=False):
         self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
         self.cont_max_ = [20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
         self.cont_diff_ = [20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
         self.hash_dim_ = sparse_feature_dim
+        self.fix_id_range_ = fix_id_range
+        self.extend_id_range_ = extend_id_range
         # here, training data are lines with line_index < train_idx_
         self.train_idx_ = 41256555
         self.continuous_range_ = range(1, 14)
@@ -35,7 +38,19 @@ class CriteoDataset(Dataset):
                             else:
                                 dense_feature.append((float(features[idx]) - self.cont_min_[idx - 1]) / self.cont_diff_[idx - 1])
                         for idx in self.categorical_range_:
-                            sparse_feature.append([hash("%d_%s" % (idx, features[idx])) % self.hash_dim_])
+                            feature_id = hash("%d_%s" % (idx, features[idx]))
+                            if self.fix_id_range_:
+                                feature_id = feature_id % self.hash_dim_
+                            sparse_feature.append([feature_id])
+                        if self.extend_id_range_:
+                            for i in range(len(self.categorical_range_)):
+                                for j in range(i + 1, len(self.categorical_range_)):
+                                    idx1 = self.categorical_range_[i]
+                                    idx2 = self.categorical_range_[j]
+                                    feature_id = hash("%d_%s_%d_%s" % (idx1, features[idx1], idx2, features[idx2]))
+                                    if self.fix_id_range_:
+                                        feature_id = feature_id % self.hash_dim_
+                                    sparse_feature.append([feature_id])
                         label = [int(features[0])]
                         yield [dense_feature] + sparse_feature + [label]
 
diff --git a/fluid/PaddleRec/ctr/train.py b/fluid/PaddleRec/ctr/train.py
index 0b4e1762..7b08bce2 100644
--- a/fluid/PaddleRec/ctr/train.py
+++ b/fluid/PaddleRec/ctr/train.py
@@ -148,7 +148,7 @@ def train():
     if not os.path.isdir(args.model_output_dir):
         os.mkdir(args.model_output_dir)
 
-    loss, data_list, auc_var, batch_auc_var = ctr_dnn_model(args.embedding_size, args.sparse_feature_dim)
+    loss, data_list, auc_var, batch_auc_var = ctr_dnn_model(args.embedding_size, args.sparse_feature_dim, False)
     optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
     optimizer.minimize(loss)
     if args.cloud_train:
--
GitLab