From af5b746b07a83abc7eeb014d4d3a7d65a5481ffd Mon Sep 17 00:00:00 2001
From: yaoxuefeng
Date: Sun, 31 May 2020 00:59:05 +0800
Subject: [PATCH] update rank yamls and models, and add infer option to rank
 models

---
 models/rank/dcn/config.yaml                   |  97 ++++++++-----
 .../data/sample_data/infer/infer_sample_data  |  10 ++
 models/rank/dcn/model.py                      |  82 +++++------
 models/rank/deepfm/config.yaml                |  92 +++++++-----
 models/rank/deepfm/model.py                   |  73 ++++------
 models/rank/din/config.yaml                   |  83 ++++++-----
 models/rank/din/data/config.txt               |   3 +
 models/rank/din/model.py                      | 133 +++++++++---------
 models/rank/din/reader.py                     |   7 +-
 models/rank/wide_deep/config.yaml             |  80 +++++++----
 models/rank/wide_deep/model.py                |  33 ++---
 models/rank/xdeepfm/config.yaml               |  86 ++++++-----
 models/rank/xdeepfm/model.py                  |  85 +++++------
 setup.py                                      |   3 +-
 14 files changed, 482 insertions(+), 385 deletions(-)
 create mode 100644 models/rank/dcn/data/sample_data/infer/infer_sample_data
 create mode 100644 models/rank/din/data/config.txt

diff --git a/models/rank/dcn/config.yaml b/models/rank/dcn/config.yaml
index 58c88f0c..390b460a 100755
--- a/models/rank/dcn/config.yaml
+++ b/models/rank/dcn/config.yaml
@@ -12,43 +12,66 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-train:
-  trainer:
-    # for cluster training
-    strategy: "async"
-
-  epochs: 10
-  workspace: "paddlerec.models.rank.dcn"
-
-  reader:
-    batch_size: 2
-    train_data_path: "{workspace}/data/sample_data/train"
-    feat_dict_name: "{workspace}/data/vocab"
+
+# global settings
+debug: false
+workspace: "paddlerec.models.rank.dcn"
+
+dataset:
+  - name: train_sample
+    type: QueueDataset
+    batch_size: 5
+    data_path: "{workspace}/data/sample_data/train"
+    sparse_slots: "label C1 C2 C3 C4 C5 C6 C7 C8 C9 C10 C11 C12 C13 C14 C15 C16 C17 C18 C19 C20 C21 C22 C23 C24 C25 C26"
+    dense_slots: "I1:1 I2:1 I3:1 I4:1 I5:1 I6:1 I7:1 I8:1 I9:1 I10:1 I11:1 I12:1 I13:1"
+  - name: infer_sample
+    type: QueueDataset
+    batch_size: 5
+    data_path: "{workspace}/data/sample_data/infer"
     sparse_slots: "label C1 C2 C3 C4 C5 C6 C7 C8 C9 C10 C11 C12 C13 C14 C15 C16 C17 C18 C19 C20 C21 C22 C23 C24 C25 C26"
     dense_slots: "I1:1 I2:1 I3:1 I4:1 I5:1 I6:1 I7:1 I8:1 I9:1 I10:1 I11:1 I12:1 I13:1"
 
-  model:
-    models: "{workspace}/model.py"
-    hyper_parameters:
-      cross_num: 2
-      dnn_hidden_units: [128, 128]
-      l2_reg_cross: 0.00005
-      dnn_use_bn: False
-      clip_by_norm: 100.0
-      cat_feat_num: "{workspace}/data/sample_data/cat_feature_num.txt"
-      is_sparse: False
-      is_test: False
-      num_field: 39
-      learning_rate: 0.0001
-      act: "relu"
-      optimizer: adam
-
-  save:
-    increment:
-      dirname: "increment"
-      epoch_interval: 2
-      save_last: True
-    inference:
-      dirname: "inference"
-      epoch_interval: 4
-      save_last: True
+hyper_parameters:
+  optimizer:
+    class: Adam
+    learning_rate: 0.0001
+  # user-defined configuration
+  cross_num: 2
+  dnn_hidden_units: [128, 128]
+  l2_reg_cross: 0.00005
+  dnn_use_bn: False
+  clip_by_norm: 100.0
+  cat_feat_num: "{workspace}/data/sample_data/cat_feature_num.txt"
+  is_sparse: False
+
+
+mode: train_runner
+# if infer, change mode to "infer_runner" and change phase to "infer_phase"
+
+runner:
+  - name: train_runner
+    trainer_class: single_train
+    epochs: 1
+    device: cpu
+    init_model_path: ""
+    save_checkpoint_interval: 1
+    save_inference_interval: 1
+    save_checkpoint_path: "increment"
+    save_inference_path: "inference"
+    print_interval: 1
+  - name: infer_runner
+    trainer_class: single_infer
+    epochs: 1
+    device: cpu
+    init_model_path: "increment/0"
+    print_interval: 1
+
+phase:
+- name: 
phase1 + model: "{workspace}/model.py" + dataset_name: train_sample + thread_num: 1 +#- name: infer_phase +# model: "{workspace}/model.py" +# dataset_name: infer_sample +# thread_num: 1 diff --git a/models/rank/dcn/data/sample_data/infer/infer_sample_data b/models/rank/dcn/data/sample_data/infer/infer_sample_data new file mode 100644 index 00000000..4aa6d249 --- /dev/null +++ b/models/rank/dcn/data/sample_data/infer/infer_sample_data @@ -0,0 +1,10 @@ +label:0 I1:0.69314718056 I2:1.60943791243 I3:1.79175946923 I4:0.0 I5:7.23201033166 I6:1.60943791243 I7:2.77258872224 I8:1.09861228867 I9:5.20400668708 I10:0.69314718056 I11:1.09861228867 I12:0 I13:1.09861228867 C1:95 C2:398 C3:0 C4:0 C5:53 C6:1 C7:73 C8:71 C9:3 C10:1974 C11:832 C12:0 C13:875 C14:8 C15:1764 C16:0 C17:5 C18:390 C19:226 C20:1 C21:0 C22:0 C23:8 C24:1759 C25:1 C26:862 +label:0 I1:1.09861228867 I2:1.38629436112 I3:3.80666248977 I4:0.69314718056 I5:4.63472898823 I6:2.19722457734 I7:1.09861228867 I8:1.09861228867 I9:1.60943791243 I10:0.69314718056 I11:0.69314718056 I12:0 I13:1.60943791243 C1:95 C2:200 C3:1184 C4:1929 C5:53 C6:4 C7:1477 C8:2 C9:3 C10:1283 C11:1567 C12:1048 C13:271 C14:6 C15:1551 C16:899 C17:1 C18:162 C19:226 C20:2 C21:575 C22:0 C23:8 C24:1615 C25:1 C26:659 +label:0 I1:1.09861228867 I2:1.38629436112 I3:0.69314718056 I4:2.7080502011 I5:6.64378973315 I6:4.49980967033 I7:1.60943791243 I8:1.09861228867 I9:5.50533153593 I10:0.69314718056 I11:1.38629436112 I12:1.38629436112 I13:3.82864139649 C1:123 C2:378 C3:991 C4:197 C5:53 C6:1 C7:689 C8:2 C9:3 C10:245 C11:623 C12:1482 C13:887 C14:21 C15:106 C16:720 C17:3 C18:768 C19:0 C20:0 C21:1010 C22:1 C23:8 C24:720 C25:0 C26:0 +label:0 I1:0 I2:6.79905586206 I3:0 I4:0 I5:8.38776764398 I6:0 I7:0.0 I8:0.0 I9:0.0 I10:0 I11:0.0 I12:0 I13:0 C1:95 C2:227 C3:0 C4:219 C5:53 C6:4 C7:3174 C8:2 C9:3 C10:569 C11:1963 C12:0 C13:1150 C14:21 C15:1656 C16:0 C17:6 C18:584 C19:0 C20:0 C21:0 C22:0 C23:8 C24:954 C25:0 C26:0 +label:0 I1:1.38629436112 I2:1.09861228867 I3:0 I4:0.0 I5:1.09861228867 I6:0.0 I7:1.38629436112 I8:0.0 I9:0.0 I10:0.69314718056 I11:0.69314718056 I12:0 I13:0.0 C1:121 C2:147 C3:0 C4:1356 C5:53 C6:7 C7:2120 C8:2 C9:3 C10:703 C11:1678 C12:1210 C13:1455 C14:8 C15:538 C16:1276 C17:6 C18:346 C19:0 C20:0 C21:944 C22:0 C23:10 C24:355 C25:0 C26:0 +label:0 I1:0 I2:1.09861228867 I3:0 I4:0 I5:9.45915167004 I6:0 I7:0.0 I8:0.0 I9:1.94591014906 I10:0 I11:0.0 I12:0 I13:0 C1:14 C2:75 C3:993 C4:480 C5:50 C6:6 C7:1188 C8:2 C9:3 C10:245 C11:1037 C12:1365 C13:1421 C14:21 C15:786 C16:5 C17:2 C18:555 C19:0 C20:0 C21:1408 C22:6 C23:7 C24:753 C25:0 C26:0 +label:0 I1:0 I2:1.60943791243 I3:1.09861228867 I4:0 I5:8.06117135969 I6:0 I7:0.0 I8:0.69314718056 I9:1.09861228867 I10:0 I11:0.0 I12:0 I13:0 C1:139 C2:343 C3:553 C4:828 C5:50 C6:4 C7:0 C8:2 C9:3 C10:245 C11:2081 C12:260 C13:455 C14:21 C15:122 C16:1159 C17:2 C18:612 C19:0 C20:0 C21:1137 C22:0 C23:1 C24:1583 C25:0 C26:0 +label:1 I1:0.69314718056 I2:2.07944154168 I3:1.09861228867 I4:0.0 I5:0.0 I6:0.0 I7:0.69314718056 I8:0.0 I9:0.0 I10:0.69314718056 I11:0.69314718056 I12:0 I13:0.0 C1:95 C2:227 C3:0 C4:1567 C5:21 C6:7 C7:2496 C8:71 C9:3 C10:1913 C11:2212 C12:0 C13:673 C14:21 C15:1656 C16:0 C17:5 C18:584 C19:0 C20:0 C21:0 C22:0 C23:10 C24:954 C25:0 C26:0 +label:0 I1:0 I2:3.87120101091 I3:1.60943791243 I4:2.19722457734 I5:9.85277303799 I6:5.52146091786 I7:3.36729582999 I8:3.4657359028 I9:4.9558270576 I10:0 I11:0.69314718056 I12:0 I13:2.19722457734 C1:14 C2:14 C3:454 C4:197 C5:53 C6:1 C7:1386 C8:2 C9:3 C10:0 C11:1979 C12:205 C13:214 C14:6 C15:1837 C16:638 C17:5 
C18:6 C19:0 C20:0 C21:70 C22:0 C23:10 C24:720 C25:0 C26:0 +label:0 I1:0 I2:3.66356164613 I3:0 I4:0.69314718056 I5:10.4263800775 I6:3.09104245336 I7:0.69314718056 I8:1.09861228867 I9:1.38629436112 I10:0 I11:0.69314718056 I12:0 I13:0.69314718056 C1:14 C2:179 C3:120 C4:746 C5:53 C6:0 C7:1312 C8:2 C9:3 C10:1337 C11:1963 C12:905 C13:1150 C14:21 C15:1820 C16:328 C17:9 C18:77 C19:0 C20:0 C21:311 C22:0 C23:10 C24:89 C25:0 C26:0 diff --git a/models/rank/dcn/model.py b/models/rank/dcn/model.py index 89113a31..52764c3e 100755 --- a/models/rank/dcn/model.py +++ b/models/rank/dcn/model.py @@ -24,44 +24,21 @@ class Model(ModelBase): def __init__(self, config): ModelBase.__init__(self, config) - def init_network(self): + def _init_hyper_parameters(self): self.cross_num = envs.get_global_env("hyper_parameters.cross_num", - None, self._namespace) + None) self.dnn_hidden_units = envs.get_global_env( - "hyper_parameters.dnn_hidden_units", None, self._namespace) + "hyper_parameters.dnn_hidden_units", None) self.l2_reg_cross = envs.get_global_env( - "hyper_parameters.l2_reg_cross", None, self._namespace) + "hyper_parameters.l2_reg_cross", None) self.dnn_use_bn = envs.get_global_env("hyper_parameters.dnn_use_bn", - None, self._namespace) + None) self.clip_by_norm = envs.get_global_env( - "hyper_parameters.clip_by_norm", None, self._namespace) - cat_feat_num = envs.get_global_env("hyper_parameters.cat_feat_num", - None, self._namespace) - - self.sparse_inputs = self._sparse_data_var[1:] - self.dense_inputs = self._dense_data_var - self.target_input = self._sparse_data_var[0] - - cat_feat_dims_dict = OrderedDict() - for line in open(cat_feat_num): - spls = line.strip().split() - assert len(spls) == 2 - cat_feat_dims_dict[spls[0]] = int(spls[1]) - self.cat_feat_dims_dict = cat_feat_dims_dict if cat_feat_dims_dict else OrderedDict( - ) + "hyper_parameters.clip_by_norm", None) + self.cat_feat_num = envs.get_global_env( + "hyper_parameters.cat_feat_num", None) self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse", - None, self._namespace) - - self.dense_feat_names = [i.name for i in self.dense_inputs] - self.sparse_feat_names = [i.name for i in self.sparse_inputs] - - # {feat_name: dims} - self.feat_dims_dict = OrderedDict( - [(feat_name, 1) for feat_name in self.dense_feat_names]) - self.feat_dims_dict.update(self.cat_feat_dims_dict) - - self.net_input = None - self.loss = None + None) def _create_embedding_input(self): # sparse embedding @@ -121,9 +98,29 @@ class Model(ModelBase): def _l2_loss(self, w): return fluid.layers.reduce_sum(fluid.layers.square(w)) - def train_net(self): - self._init_slots() - self.init_network() + def net(self, inputs, is_infer=False): + self.sparse_inputs = self._sparse_data_var[1:] + self.dense_inputs = self._dense_data_var + self.target_input = self._sparse_data_var[0] + + cat_feat_dims_dict = OrderedDict() + for line in open(self.cat_feat_num): + spls = line.strip().split() + assert len(spls) == 2 + cat_feat_dims_dict[spls[0]] = int(spls[1]) + self.cat_feat_dims_dict = cat_feat_dims_dict if cat_feat_dims_dict else OrderedDict( + ) + + self.dense_feat_names = [i.name for i in self.dense_inputs] + self.sparse_feat_names = [i.name for i in self.sparse_inputs] + + # {feat_name: dims} + self.feat_dims_dict = OrderedDict( + [(feat_name, 1) for feat_name in self.dense_feat_names]) + self.feat_dims_dict.update(self.cat_feat_dims_dict) + + self.net_input = None + self.loss = None self.net_input = self._create_embedding_input() @@ -146,6 +143,9 @@ class Model(ModelBase): 
self._metrics["AUC"] = auc_var self._metrics["BATCH_AUC"] = batch_auc_var + if is_infer: + self._infer_results["AUC"] = auc_var + # logloss logloss = fluid.layers.log_loss( self.prob, fluid.layers.cast( @@ -157,11 +157,7 @@ class Model(ModelBase): self.loss = self.avg_logloss + l2_reg_cross_loss self._cost = self.loss - def optimizer(self): - learning_rate = envs.get_global_env("hyper_parameters.learning_rate", - None, self._namespace) - optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True) - return optimizer - - def infer_net(self): - self.train_net() + #def optimizer(self): + # + # optimizer = fluid.optimizer.Adam(self.learning_rate, lazy_mode=True) + # return optimizer diff --git a/models/rank/deepfm/config.yaml b/models/rank/deepfm/config.yaml index 956b65b0..d1d25c2c 100755 --- a/models/rank/deepfm/config.yaml +++ b/models/rank/deepfm/config.yaml @@ -12,39 +12,65 @@ # See the License for the specific language governing permissions and # limitations under the License. -train: - trainer: - # for cluster training - strategy: "async" - - epochs: 10 - workspace: "paddlerec.models.rank.deepfm" - - reader: - batch_size: 2 - train_data_path: "{workspace}/data/sample_data/train" - feat_dict_name: "{workspace}/data/sample_data/feat_dict_10.pkl2" +# global settings +debug: false +workspace: "paddlerec.models.rank.deepfm" + + +dataset: + - name: train_sample + type: QueueDataset + batch_size: 5 + data_path: "{workspace}/data/sample_data/train" + sparse_slots: "label feat_idx" + dense_slots: "feat_value:39" + - name: infer_sample + type: QueueDataset + batch_size: 5 + data_path: "{workspace}/data/sample_data/train" sparse_slots: "label feat_idx" dense_slots: "feat_value:39" - model: - models: "{workspace}/model.py" - hyper_parameters: - sparse_feature_number: 1086460 - sparse_feature_dim: 9 - num_field: 39 - fc_sizes: [400, 400, 400] - learning_rate: 0.0001 - reg: 0.001 - act: "relu" - optimizer: SGD - - save: - increment: - dirname: "increment" - epoch_interval: 2 - save_last: True - inference: - dirname: "inference" - epoch_interval: 4 - save_last: True +hyper_parameters: + optimizer: + class: SGD + learning_rate: 0.0001 + sparse_feature_number: 1086460 + sparse_feature_dim: 9 + num_field: 39 + fc_sizes: [400, 400, 400] + reg: 0.001 + act: "relu" + + +mode: train_runner +# if infer, change mode to "infer_runner" and change phase to "infer_phase" + +runner: + - name: train_runner + trainer_class: single_train + epochs: 2 + device: cpu + init_model_path: "" + save_checkpoint_interval: 1 + save_inference_interval: 1 + save_checkpoint_path: "increment" + save_inference_path: "inference" + print_interval: 1 + - name: infer_runner + trainer_class: single_infer + epochs: 1 + device: cpu + init_model_path: "increment/0" + print_interval: 1 + + +phase: +- name: phase1 + model: "{workspace}/model.py" + dataset_name: train_sample + thread_num: 1 +#- name: infer_phase +# model: "{workspace}/model.py" +# dataset_name: infer_sample +# thread_num: 1 diff --git a/models/rank/deepfm/model.py b/models/rank/deepfm/model.py index deb63c40..8ac8df13 100755 --- a/models/rank/deepfm/model.py +++ b/models/rank/deepfm/model.py @@ -24,42 +24,46 @@ class Model(ModelBase): def __init__(self, config): ModelBase.__init__(self, config) - def deepfm_net(self): + def _init_hyper_parameters(self): + self.sparse_feature_number = envs.get_global_env( + "hyper_parameters.sparse_feature_number", None) + self.sparse_feature_dim = envs.get_global_env( + "hyper_parameters.sparse_feature_dim", None) + self.num_field = 
envs.get_global_env("hyper_parameters.num_field", + None) + self.reg = envs.get_global_env("hyper_parameters.reg", 1e-4) + self.layer_sizes = envs.get_global_env("hyper_parameters.fc_sizes", + None) + self.act = envs.get_global_env("hyper_parameters.act", None) + + def net(self, inputs, is_infer=False): init_value_ = 0.1 is_distributed = True if envs.get_trainer() == "CtrTrainer" else False - sparse_feature_number = envs.get_global_env( - "hyper_parameters.sparse_feature_number", None, self._namespace) - sparse_feature_dim = envs.get_global_env( - "hyper_parameters.sparse_feature_dim", None, self._namespace) # ------------------------- network input -------------------------- - num_field = envs.get_global_env("hyper_parameters.num_field", None, - self._namespace) - raw_feat_idx = self._sparse_data_var[1] raw_feat_value = self._dense_data_var[0] self.label = self._sparse_data_var[0] feat_idx = raw_feat_idx feat_value = fluid.layers.reshape( - raw_feat_value, [-1, num_field, 1]) # None * num_field * 1 + raw_feat_value, [-1, self.num_field, 1]) # None * num_field * 1 - reg = envs.get_global_env("hyper_parameters.reg", 1e-4, - self._namespace) first_weights_re = fluid.embedding( input=feat_idx, is_sparse=True, is_distributed=is_distributed, dtype='float32', - size=[sparse_feature_number + 1, 1], + size=[self.sparse_feature_number + 1, 1], padding_idx=0, param_attr=fluid.ParamAttr( initializer=fluid.initializer.TruncatedNormalInitializer( loc=0.0, scale=init_value_), - regularizer=fluid.regularizer.L1DecayRegularizer(reg))) + regularizer=fluid.regularizer.L1DecayRegularizer(self.reg))) first_weights = fluid.layers.reshape( - first_weights_re, shape=[-1, num_field, 1]) # None * num_field * 1 + first_weights_re, + shape=[-1, self.num_field, 1]) # None * num_field * 1 y_first_order = fluid.layers.reduce_sum((first_weights * feat_value), 1) @@ -70,16 +74,17 @@ class Model(ModelBase): is_sparse=True, is_distributed=is_distributed, dtype='float32', - size=[sparse_feature_number + 1, sparse_feature_dim], + size=[self.sparse_feature_number + 1, self.sparse_feature_dim], padding_idx=0, param_attr=fluid.ParamAttr( initializer=fluid.initializer.TruncatedNormalInitializer( loc=0.0, - scale=init_value_ / math.sqrt(float(sparse_feature_dim))))) + scale=init_value_ / + math.sqrt(float(self.sparse_feature_dim))))) feat_embeddings = fluid.layers.reshape( feat_embeddings_re, - shape=[-1, num_field, - sparse_feature_dim]) # None * num_field * embedding_size + shape=[-1, self.num_field, self.sparse_feature_dim + ]) # None * num_field * embedding_size feat_embeddings = feat_embeddings * feat_value # None * num_field * embedding_size # sum_square part @@ -101,17 +106,13 @@ class Model(ModelBase): # ------------------------- DNN -------------------------- - layer_sizes = envs.get_global_env("hyper_parameters.fc_sizes", None, - self._namespace) - act = envs.get_global_env("hyper_parameters.act", None, - self._namespace) - y_dnn = fluid.layers.reshape(feat_embeddings, - [-1, num_field * sparse_feature_dim]) - for s in layer_sizes: + y_dnn = fluid.layers.reshape( + feat_embeddings, [-1, self.num_field * self.sparse_feature_dim]) + for s in self.layer_sizes: y_dnn = fluid.layers.fc( input=y_dnn, size=s, - act=act, + act=self.act, param_attr=fluid.ParamAttr( initializer=fluid.initializer.TruncatedNormalInitializer( loc=0.0, scale=init_value_ / math.sqrt(float(10)))), @@ -133,21 +134,12 @@ class Model(ModelBase): self.predict = fluid.layers.sigmoid(y_first_order + y_second_order + y_dnn) - - def train_net(self): - 
self._init_slots() - self.deepfm_net() - - # ------------------------- Cost(logloss) -------------------------- - cost = fluid.layers.log_loss( input=self.predict, label=fluid.layers.cast(self.label, "float32")) avg_cost = fluid.layers.reduce_sum(cost) self._cost = avg_cost - # ------------------------- Metric(Auc) -------------------------- - predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1) label_int = fluid.layers.cast(self.label, 'int64') auc_var, batch_auc_var, _ = fluid.layers.auc(input=predict_2d, @@ -155,12 +147,5 @@ class Model(ModelBase): slide_steps=0) self._metrics["AUC"] = auc_var self._metrics["BATCH_AUC"] = batch_auc_var - - def optimizer(self): - learning_rate = envs.get_global_env("hyper_parameters.learning_rate", - None, self._namespace) - optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True) - return optimizer - - def infer_net(self): - self.train_net() + if is_infer: + self._infer_results["AUC"] = auc_var diff --git a/models/rank/din/config.yaml b/models/rank/din/config.yaml index bdf56b82..e61e4636 100755 --- a/models/rank/din/config.yaml +++ b/models/rank/din/config.yaml @@ -12,40 +12,57 @@ # See the License for the specific language governing permissions and # limitations under the License. -train: - trainer: - # for cluster training - strategy: "async" +# global settings +debug: false +workspace: "paddlerec.models.rank.din" - epochs: 10 - workspace: "paddlerec.models.rank.din" +dataset: + - name: sample_1 + type: DataLoader + batch_size: 5 + data_path: "{workspace}/data/train_data" + data_converter: "{workspace}/reader.py" + - name: infer_sample + type: DataLoader + batch_size: 5 + data_path: "{workspace}/data/train_data" + data_converter: "{workspace}/reader.py" - reader: - batch_size: 2 - class: "{workspace}/reader.py" - train_data_path: "{workspace}/data/train_data" - dataset_class: "DataLoader" +hyper_parameters: + optimizer: + class: SGD + learning_rate: 0.0001 + use_DataLoader: True + item_emb_size: 64 + cat_emb_size: 64 + is_sparse: False + config_path: "{workspace}/data/config.txt" + act: "sigmoid" - model: - models: "{workspace}/model.py" - hyper_parameters: - use_DataLoader: True - item_emb_size: 64 - cat_emb_size: 64 - is_sparse: False - config_path: "data/config.txt" - fc_sizes: [400, 400, 400] - learning_rate: 0.0001 - reg: 0.001 - act: "sigmoid" - optimizer: SGD - save: - increment: - dirname: "increment" - epoch_interval: 2 - save_last: True - inference: - dirname: "inference" - epoch_interval: 4 - save_last: True +mode: train_runner + +runner: + - name: train_runner + trainer_class: single_train + epochs: 1 + device: cpu + init_model_path: "" + save_checkpoint_interval: 1 + save_inference_interval: 1 + save_checkpoint_path: "increment" + save_inference_path: "inference" + - name: infer_runner + trainer_class: single_infer + epochs: 1 + device: cpu + init_model_path: "increment/0" +phase: +- name: phase1 + model: "{workspace}/model.py" + dataset_name: sample_1 + thread_num: 1 +#- name: infer_phase +# model: "{workspace}/model.py" +# dataset_name: infer_sample +# thread_num: 1 diff --git a/models/rank/din/data/config.txt b/models/rank/din/data/config.txt new file mode 100644 index 00000000..8552fb4c --- /dev/null +++ b/models/rank/din/data/config.txt @@ -0,0 +1,3 @@ +192403 +63001 +801 diff --git a/models/rank/din/model.py b/models/rank/din/model.py index c2acbe66..9be8f9e7 100755 --- a/models/rank/din/model.py +++ b/models/rank/din/model.py @@ -22,6 +22,64 @@ class Model(ModelBase): def __init__(self, config): 
ModelBase.__init__(self, config) + def _init_hyper_parameters(self): + self.item_emb_size = envs.get_global_env( + "hyper_parameters.item_emb_size", 64) + self.cat_emb_size = envs.get_global_env( + "hyper_parameters.cat_emb_size", 64) + self.act = envs.get_global_env("hyper_parameters.act", "sigmoid") + self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse", + False) + #significant for speeding up the training process + self.config_path = envs.get_global_env("hyper_parameters.config_path", + "data/config.txt") + self.use_DataLoader = envs.get_global_env( + "hyper_parameters.use_DataLoader", False) + + def input_data(self, is_infer=False, **kwargs): + seq_len = -1 + self.data_var = [] + hist_item_seq = fluid.data( + name="hist_item_seq", shape=[None, seq_len], dtype="int64") + self.data_var.append(hist_item_seq) + + hist_cat_seq = fluid.data( + name="hist_cat_seq", shape=[None, seq_len], dtype="int64") + self.data_var.append(hist_cat_seq) + + target_item = fluid.data( + name="target_item", shape=[None], dtype="int64") + self.data_var.append(target_item) + + target_cat = fluid.data(name="target_cat", shape=[None], dtype="int64") + self.data_var.append(target_cat) + + label = fluid.data(name="label", shape=[None, 1], dtype="float32") + self.data_var.append(label) + + mask = fluid.data( + name="mask", shape=[None, seq_len, 1], dtype="float32") + self.data_var.append(mask) + + target_item_seq = fluid.data( + name="target_item_seq", shape=[None, seq_len], dtype="int64") + self.data_var.append(target_item_seq) + + target_cat_seq = fluid.data( + name="target_cat_seq", shape=[None, seq_len], dtype="int64") + self.data_var.append(target_cat_seq) + + #if self.use_DataLoader: + # self._data_loader = fluid.io.DataLoader.from_generator( + # feed_list=self.data_var, + # capacity=10000, + # use_double_buffer=False, + # iterable=False) + train_inputs = [hist_item_seq] + [hist_cat_seq] + [target_item] + [ + target_cat + ] + [label] + [mask] + [target_item_seq] + [target_cat_seq] + return train_inputs + def config_read(self, config_path): with open(config_path, "r") as fin: user_count = int(fin.readline().strip()) @@ -59,65 +117,21 @@ class Model(ModelBase): out = fluid.layers.reshape(x=out, shape=[0, hidden_size]) return out - def train_net(self): - seq_len = -1 - self.item_emb_size = envs.get_global_env( - "hyper_parameters.item_emb_size", 64, self._namespace) - self.cat_emb_size = envs.get_global_env( - "hyper_parameters.cat_emb_size", 64, self._namespace) - self.act = envs.get_global_env("hyper_parameters.act", "sigmoid", - self._namespace) - #item_emb_size = 64 - #cat_emb_size = 64 - self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse", - False, self._namespace) - #significant for speeding up the training process - self.config_path = envs.get_global_env( - "hyper_parameters.config_path", "data/config.txt", self._namespace) - self.use_DataLoader = envs.get_global_env( - "hyper_parameters.use_DataLoader", False, self._namespace) + def net(self, inputs, is_infer=False): + hist_item_seq = inputs[0] + hist_cat_seq = inputs[1] + target_item = inputs[2] + target_cat = inputs[3] + label = inputs[4] + mask = inputs[5] + target_item_seq = inputs[6] + target_cat_seq = inputs[7] + user_count, item_count, cat_count = self.config_read(self.config_path) item_emb_attr = fluid.ParamAttr(name="item_emb") cat_emb_attr = fluid.ParamAttr(name="cat_emb") - hist_item_seq = fluid.data( - name="hist_item_seq", shape=[None, seq_len], dtype="int64") - self._data_var.append(hist_item_seq) - - hist_cat_seq = 
fluid.data(
-            name="hist_cat_seq", shape=[None, seq_len], dtype="int64")
-        self._data_var.append(hist_cat_seq)
-
-        target_item = fluid.data(
-            name="target_item", shape=[None], dtype="int64")
-        self._data_var.append(target_item)
-
-        target_cat = fluid.data(name="target_cat", shape=[None], dtype="int64")
-        self._data_var.append(target_cat)
-
-        label = fluid.data(name="label", shape=[None, 1], dtype="float32")
-        self._data_var.append(label)
-
-        mask = fluid.data(
-            name="mask", shape=[None, seq_len, 1], dtype="float32")
-        self._data_var.append(mask)
-
-        target_item_seq = fluid.data(
-            name="target_item_seq", shape=[None, seq_len], dtype="int64")
-        self._data_var.append(target_item_seq)
-
-        target_cat_seq = fluid.data(
-            name="target_cat_seq", shape=[None, seq_len], dtype="int64")
-        self._data_var.append(target_cat_seq)
-
-        if self.use_DataLoader:
-            self._data_loader = fluid.io.DataLoader.from_generator(
-                feed_list=self._data_var,
-                capacity=10000,
-                use_double_buffer=False,
-                iterable=False)
-
         hist_item_emb = fluid.embedding(
             input=hist_item_seq,
             size=[item_count, self.item_emb_size],
@@ -195,12 +209,5 @@
                                                      slide_steps=0)
         self._metrics["AUC"] = auc_var
         self._metrics["BATCH_AUC"] = batch_auc_var
-
-    def optimizer(self):
-        learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
-                                            None, self._namespace)
-        optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
-        return optimizer
-
-    def infer_net(self, parameter_list):
-        self.deepfm_net()
+        if is_infer:
+            self._infer_results["AUC"] = auc_var
diff --git a/models/rank/din/reader.py b/models/rank/din/reader.py
index aba06141..90d358b9 100755
--- a/models/rank/din/reader.py
+++ b/models/rank/din/reader.py
@@ -29,8 +29,8 @@ from paddlerec.core.utils import envs
 
 class TrainReader(Reader):
     def init(self):
-        self.train_data_path = envs.get_global_env("train_data_path", None,
-                                                   "train.reader")
+        self.train_data_path = envs.get_global_env(
+            "dataset.sample_1.data_path", None)
         self.res = []
         self.max_len = 0
 
@@ -46,7 +46,8 @@
         fo = open("tmp.txt", "w")
         fo.write(str(self.max_len))
         fo.close()
-        self.batch_size = envs.get_global_env("batch_size", 32, "train.reader")
+        self.batch_size = envs.get_global_env("dataset.sample_1.batch_size",
+                                              32)
         self.group_size = self.batch_size * 20
 
     def _process_line(self, line):
diff --git a/models/rank/wide_deep/config.yaml b/models/rank/wide_deep/config.yaml
index 9cadddf2..af9e106e 100755
--- a/models/rank/wide_deep/config.yaml
+++ b/models/rank/wide_deep/config.yaml
@@ -12,37 +12,59 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-train: - trainer: - # for cluster training - strategy: "async" +# global settings +debug: false +workspace: "paddlerec.models.rank.wide_deep" - epochs: 10 - workspace: "paddlerec.models.rank.wide_deep" - reader: - batch_size: 2 - train_data_path: "{workspace}/data/sample_data/train" +dataset: + - name: sample_1 + type: QueueDataset + batch_size: 5 + data_path: "{workspace}/data/sample_data/train" sparse_slots: "label" dense_slots: "wide_input:8 deep_input:58" + - name: infer_sample + type: QueueDataset + batch_size: 5 + data_path: "{workspace}/data/sample_data/train" + sparse_slots: "label" + dense_slots: "wide_input:8 deep_input:58" + +hyper_parameters: + optimizer: + class: SGD + learning_rate: 0.0001 + hidden1_units: 75 + hidden2_units: 50 + hidden3_units: 25 + + +mode: train_runner +# if infer, change mode to "infer_runner" and change phase to "infer_phase" + +runner: + - name: train_runner + trainer_class: single_train + epochs: 1 + device: cpu + init_model_path: "" + save_checkpoint_interval: 1 + save_inference_interval: 1 + save_checkpoint_path: "increment" + save_inference_path: "inference" + - name: infer_runner + trainer_class: single_infer + epochs: 1 + device: cpu + init_model_path: "increment/0" - model: - models: "{workspace}/model.py" - hyper_parameters: - hidden1_units: 75 - hidden2_units: 50 - hidden3_units: 25 - learning_rate: 0.0001 - reg: 0.001 - act: "relu" - optimizer: SGD - - save: - increment: - dirname: "increment" - epoch_interval: 2 - save_last: True - inference: - dirname: "inference" - epoch_interval: 4 - save_last: True +phase: +- name: phase1 + model: "{workspace}/model.py" + dataset_name: sample_1 + thread_num: 1 +#- name: infer_phase +# model: "{workspace}/model.py" +# dataset_name: infer_sample +# thread_num: 1 diff --git a/models/rank/wide_deep/model.py b/models/rank/wide_deep/model.py index d798a545..e9d4da60 100755 --- a/models/rank/wide_deep/model.py +++ b/models/rank/wide_deep/model.py @@ -24,6 +24,14 @@ class Model(ModelBase): def __init__(self, config): ModelBase.__init__(self, config) + def _init_hyper_parameters(self): + self.hidden1_units = envs.get_global_env( + "hyper_parameters.hidden1_units", 75) + self.hidden2_units = envs.get_global_env( + "hyper_parameters.hidden2_units", 50) + self.hidden3_units = envs.get_global_env( + "hyper_parameters.hidden3_units", 25) + def wide_part(self, data): out = fluid.layers.fc( input=data, @@ -56,21 +64,14 @@ class Model(ModelBase): return l3 - def train_net(self): - self._init_slots() + def net(self, inputs, is_infer=False): wide_input = self._dense_data_var[0] deep_input = self._dense_data_var[1] label = self._sparse_data_var[0] - hidden1_units = envs.get_global_env("hyper_parameters.hidden1_units", - 75, self._namespace) - hidden2_units = envs.get_global_env("hyper_parameters.hidden2_units", - 50, self._namespace) - hidden3_units = envs.get_global_env("hyper_parameters.hidden3_units", - 25, self._namespace) wide_output = self.wide_part(wide_input) - deep_output = self.deep_part(deep_input, hidden1_units, hidden2_units, - hidden3_units) + deep_output = self.deep_part(deep_input, self.hidden1_units, + self.hidden2_units, self.hidden3_units) wide_model = fluid.layers.fc( input=wide_output, @@ -109,18 +110,12 @@ class Model(ModelBase): self._metrics["AUC"] = auc_var self._metrics["BATCH_AUC"] = batch_auc self._metrics["ACC"] = acc + if is_infer: + self._infer_results["AUC"] = auc_var + self._infer_results["ACC"] = acc cost = fluid.layers.sigmoid_cross_entropy_with_logits( x=prediction, 
label=fluid.layers.cast(
                 label, dtype='float32'))
         avg_cost = fluid.layers.mean(cost)
         self._cost = avg_cost
-
-    def optimizer(self):
-        learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
-                                            None, self._namespace)
-        optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
-        return optimizer
-
-    def infer_net(self):
-        self.train_net()
diff --git a/models/rank/xdeepfm/config.yaml b/models/rank/xdeepfm/config.yaml
index 37b6b65b..3e576111 100755
--- a/models/rank/xdeepfm/config.yaml
+++ b/models/rank/xdeepfm/config.yaml
@@ -11,41 +11,61 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+debug: false
+workspace: "paddlerec.models.rank.xdeepfm"
 
-train:
-  trainer:
-    # for cluster training
-    strategy: "async"
-
-  epochs: 10
-  workspace: "paddlerec.models.rank.xdeepfm"
-
-  reader:
-    batch_size: 2
-    train_data_path: "{workspace}/data/sample_data/train"
+dataset:
+  - name: sample_1
+    type: QueueDataset # or DataLoader
+    batch_size: 5
+    data_path: "{workspace}/data/sample_data/train"
+    sparse_slots: "label feat_idx"
+    dense_slots: "feat_value:39"
+  - name: infer_sample
+    type: QueueDataset # or DataLoader
+    batch_size: 5
+    data_path: "{workspace}/data/sample_data/train"
     sparse_slots: "label feat_idx"
     dense_slots: "feat_value:39"
 
-  model:
-    models: "{workspace}/model.py"
-    hyper_parameters:
-      layer_sizes_dnn: [10, 10, 10]
-      layer_sizes_cin: [10, 10]
-      sparse_feature_number: 1086460
-      sparse_feature_dim: 9
-      num_field: 39
-      fc_sizes: [400, 400, 400]
-      learning_rate: 0.0001
-      reg: 0.0001
-      act: "relu"
-      optimizer: SGD
+hyper_parameters:
+  optimizer:
+    class: SGD
+    learning_rate: 0.0001
+  layer_sizes_dnn: [10, 10, 10]
+  layer_sizes_cin: [10, 10]
+  sparse_feature_number: 1086460
+  sparse_feature_dim: 9
+  num_field: 39
+  fc_sizes: [400, 400, 400]
+  act: "relu"
+
+
+mode: train_runner
+# if infer, change mode to "infer_runner" and change phase to "infer_phase"
+
+runner:
+  - name: train_runner
+    trainer_class: single_train
+    epochs: 1
+    device: cpu
+    init_model_path: ""
+    save_checkpoint_interval: 1
+    save_inference_interval: 1
+    save_checkpoint_path: "increment"
+    save_inference_path: "inference"
+  - name: infer_runner
+    trainer_class: single_infer
+    epochs: 1
+    device: cpu
+    init_model_path: "increment/0"
 
-  save:
-    increment:
-      dirname: "increment"
-      epoch_interval: 2
-      save_last: True
-    inference:
-      dirname: "inference"
-      epoch_interval: 4
-      save_last: True
+phase:
+- name: phase1
+  model: "{workspace}/model.py"
+  dataset_name: sample_1
+  thread_num: 1
+#- name: infer_phase
+#  model: "{workspace}/model.py"
+#  dataset_name: infer_sample
+#  thread_num: 1
diff --git a/models/rank/xdeepfm/model.py b/models/rank/xdeepfm/model.py
index 23443c7d..4ca057bd 100755
--- a/models/rank/xdeepfm/model.py
+++ b/models/rank/xdeepfm/model.py
@@ -22,38 +22,45 @@ class Model(ModelBase):
     def __init__(self, config):
         ModelBase.__init__(self, config)
 
-    def xdeepfm_net(self):
+    def _init_hyper_parameters(self):
+        self.sparse_feature_number = envs.get_global_env(
+            "hyper_parameters.sparse_feature_number", None)
+        self.sparse_feature_dim = envs.get_global_env(
+            "hyper_parameters.sparse_feature_dim", None)
+        self.num_field = envs.get_global_env("hyper_parameters.num_field",
+                                             None)
+        self.layer_sizes_cin = envs.get_global_env(
+            "hyper_parameters.layer_sizes_cin", None)
+        self.layer_sizes_dnn = envs.get_global_env(
+            "hyper_parameters.layer_sizes_dnn", None)
+        self.act = 
envs.get_global_env("hyper_parameters.act", None) + + def net(self, inputs, is_infer=False): + raw_feat_idx = self._sparse_data_var[1] + raw_feat_value = self._dense_data_var[0] + self.label = self._sparse_data_var[0] + init_value_ = 0.1 initer = fluid.initializer.TruncatedNormalInitializer( loc=0.0, scale=init_value_) is_distributed = True if envs.get_trainer() == "CtrTrainer" else False - sparse_feature_number = envs.get_global_env( - "hyper_parameters.sparse_feature_number", None, self._namespace) - sparse_feature_dim = envs.get_global_env( - "hyper_parameters.sparse_feature_dim", None, self._namespace) # ------------------------- network input -------------------------- - num_field = envs.get_global_env("hyper_parameters.num_field", None, - self._namespace) - raw_feat_idx = self._sparse_data_var[1] - raw_feat_value = self._dense_data_var[0] - self.label = self._sparse_data_var[0] - feat_idx = raw_feat_idx feat_value = fluid.layers.reshape( - raw_feat_value, [-1, num_field, 1]) # None * num_field * 1 + raw_feat_value, [-1, self.num_field, 1]) # None * num_field * 1 feat_embeddings = fluid.embedding( input=feat_idx, is_sparse=True, dtype='float32', - size=[sparse_feature_number + 1, sparse_feature_dim], + size=[self.sparse_feature_number + 1, self.sparse_feature_dim], padding_idx=0, param_attr=fluid.ParamAttr(initializer=initer)) feat_embeddings = fluid.layers.reshape(feat_embeddings, [ - -1, num_field, sparse_feature_dim + -1, self.num_field, self.sparse_feature_dim ]) # None * num_field * embedding_size feat_embeddings = feat_embeddings * feat_value # None * num_field * embedding_size @@ -63,11 +70,11 @@ class Model(ModelBase): input=feat_idx, is_sparse=True, dtype='float32', - size=[sparse_feature_number + 1, 1], + size=[self.sparse_feature_number + 1, 1], padding_idx=0, param_attr=fluid.ParamAttr(initializer=initer)) weights_linear = fluid.layers.reshape( - weights_linear, [-1, num_field, 1]) # None * num_field * 1 + weights_linear, [-1, self.num_field, 1]) # None * num_field * 1 b_linear = fluid.layers.create_parameter( shape=[1], dtype='float32', @@ -77,31 +84,30 @@ class Model(ModelBase): # -------------------- CIN -------------------- - layer_sizes_cin = envs.get_global_env( - "hyper_parameters.layer_sizes_cin", None, self._namespace) Xs = [feat_embeddings] - last_s = num_field - for s in layer_sizes_cin: + last_s = self.num_field + for s in self.layer_sizes_cin: # calculate Z^(k+1) with X^k and X^0 X_0 = fluid.layers.reshape( fluid.layers.transpose(Xs[0], [0, 2, 1]), - [-1, sparse_feature_dim, num_field, + [-1, self.sparse_feature_dim, self.num_field, 1]) # None, embedding_size, num_field, 1 X_k = fluid.layers.reshape( fluid.layers.transpose(Xs[-1], [0, 2, 1]), - [-1, sparse_feature_dim, 1, + [-1, self.sparse_feature_dim, 1, last_s]) # None, embedding_size, 1, last_s Z_k_1 = fluid.layers.matmul( X_0, X_k) # None, embedding_size, num_field, last_s # compresses Z^(k+1) to X^(k+1) Z_k_1 = fluid.layers.reshape(Z_k_1, [ - -1, sparse_feature_dim, last_s * num_field + -1, self.sparse_feature_dim, last_s * self.num_field ]) # None, embedding_size, last_s*num_field Z_k_1 = fluid.layers.transpose( Z_k_1, [0, 2, 1]) # None, s*num_field, embedding_size Z_k_1 = fluid.layers.reshape( - Z_k_1, [-1, last_s * num_field, 1, sparse_feature_dim] + Z_k_1, + [-1, last_s * self.num_field, 1, self.sparse_feature_dim] ) # None, last_s*num_field, 1, embedding_size (None, channal_in, h, w) X_k_1 = fluid.layers.conv2d( Z_k_1, @@ -112,7 +118,8 @@ class Model(ModelBase): param_attr=fluid.ParamAttr( 
initializer=initer)) # None, s, 1, embedding_size X_k_1 = fluid.layers.reshape( - X_k_1, [-1, s, sparse_feature_dim]) # None, s, embedding_size + X_k_1, + [-1, s, self.sparse_feature_dim]) # None, s, embedding_size Xs.append(X_k_1) last_s = s @@ -130,17 +137,13 @@ class Model(ModelBase): # -------------------- DNN -------------------- - layer_sizes_dnn = envs.get_global_env( - "hyper_parameters.layer_sizes_dnn", None, self._namespace) - act = envs.get_global_env("hyper_parameters.act", None, - self._namespace) - y_dnn = fluid.layers.reshape(feat_embeddings, - [-1, num_field * sparse_feature_dim]) - for s in layer_sizes_dnn: + y_dnn = fluid.layers.reshape( + feat_embeddings, [-1, self.num_field * self.sparse_feature_dim]) + for s in self.layer_sizes_dnn: y_dnn = fluid.layers.fc( input=y_dnn, size=s, - act=act, + act=self.act, param_attr=fluid.ParamAttr(initializer=initer), bias_attr=None) y_dnn = fluid.layers.fc(input=y_dnn, @@ -152,11 +155,6 @@ class Model(ModelBase): # ------------------- xDeepFM ------------------ self.predict = fluid.layers.sigmoid(y_linear + y_cin + y_dnn) - - def train_net(self): - self._init_slots() - self.xdeepfm_net() - cost = fluid.layers.log_loss( input=self.predict, label=fluid.layers.cast(self.label, "float32"), @@ -172,12 +170,5 @@ class Model(ModelBase): slide_steps=0) self._metrics["AUC"] = auc_var self._metrics["BATCH_AUC"] = batch_auc_var - - def optimizer(self): - learning_rate = envs.get_global_env("hyper_parameters.learning_rate", - None, self._namespace) - optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True) - return optimizer - - def infer_net(self): - self.train_net() + if is_infer: + self._infer_results["AUC"] = auc_var diff --git a/setup.py b/setup.py index f64ae5cf..d523f409 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,8 @@ def build(dirname): models_copy = [ 'data/*.txt', 'data/*/*.txt', '*.yaml', '*.sh', 'tree/*.npy', - 'tree/*.txt', 'data/sample_data/*', 'data/sample_data/train/*' + 'tree/*.txt', 'data/sample_data/*', 'data/sample_data/train/*', + 'data/sample_data/infer/*' ] engine_copy = ['*/*.sh'] -- GitLab
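
Editor's note, appended for review and not part of the patch: every model touched here now follows the same contract. Hyper parameters are read once in _init_hyper_parameters() from global "hyper_parameters.*" keys (the old self._namespace argument to envs.get_global_env is gone), the graph is built in net(inputs, is_infer=False), and inference reuses the training graph, exporting its metrics through self._infer_results. The sketch below summarizes that contract; it is illustrative only — the one-hidden-layer network and the "hyper_parameters.fc_size" key are invented for the example, and the ModelBase/envs import paths are assumed to match the rest of the repo.

import paddle.fluid as fluid

from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase


class Model(ModelBase):
    def __init__(self, config):
        ModelBase.__init__(self, config)

    def _init_hyper_parameters(self):
        # New style: a global hyper_parameters.* key, no namespace argument.
        self.fc_size = envs.get_global_env("hyper_parameters.fc_size", 64)

    def net(self, inputs, is_infer=False):
        # Slot-based models read their variables from the slots declared in
        # config.yaml; the label is the first sparse slot.
        label = self._sparse_data_var[0]
        features = self._dense_data_var[0]

        fc = fluid.layers.fc(input=features, size=self.fc_size, act="relu")
        self.predict = fluid.layers.fc(input=fc, size=1, act="sigmoid")

        # AUC metrics, computed the same way in all five rank models.
        predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1)
        label_int = fluid.layers.cast(label, "int64")
        auc_var, batch_auc_var, _ = fluid.layers.auc(
            input=predict_2d, label=label_int, slide_steps=0)
        self._metrics["AUC"] = auc_var
        self._metrics["BATCH_AUC"] = batch_auc_var

        # The new infer option: the same graph is reused, and the infer
        # runner reads whatever the model exports in self._infer_results.
        if is_infer:
            self._infer_results["AUC"] = auc_var

        cost = fluid.layers.log_loss(
            input=self.predict, label=fluid.layers.cast(label, "float32"))
        self._cost = fluid.layers.reduce_sum(cost)

To exercise the new infer path in any of these models, set mode: infer_runner and uncomment the infer_phase block at the bottom of the corresponding config.yaml; the infer runner then restores the first saved checkpoint via init_model_path: "increment/0".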