Merge pull request #34 from yaoxuefeng6/add_nfm

add nfm in rank and clean unused codes and files

Merge pull request #34 from yaoxuefeng6/add_nfm
add nfm in rank and clean unused codes and files
f62d94a1 · wuzhihua · GitHub · d14f44da · 48294cc7 · f62d94a1
12 changed files
--- a/models/rank/dcn/model.py
+++ b/models/rank/dcn/model.py
@@ -156,8 +156,3 @@ class Model(ModelBase):
        l2_reg_cross_loss = self.l2_reg_cross * l2_reg_cross_loss
        self.loss = self.avg_logloss + l2_reg_cross_loss
        self._cost = self.loss
-    #def optimizer(self):
-    #    
-    #    optimizer = fluid.optimizer.Adam(self.learning_rate, lazy_mode=True)
-    #    return optimizer
--- a/models/rank/deepfm/data/download_preprocess.py
+++ b/models/rank/deepfm/data/download_preprocess.py
@@ -28,7 +28,7 @@ if __name__ == '__main__':
    print("download and extract starting...")
    download_file_and_uncompress(url)
-    download_file(url2, "./aid_data/feat_dict_10.pkl2", True)
+    download_file(url2, "./sample_data/feat_dict_10.pkl2", True)
    print("download and extract finished")
    print("preprocessing...")

--- a/models/rank/deepfm/data/sample_data/feat_dict_10.pkl2
+++ b/models/rank/deepfm/data/sample_data/feat_dict_10.pkl2
--- a/models/rank/nfm/__init__.py
+++ b/models/rank/nfm/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/models/rank/nfm/config.yaml
+++ b/models/rank/nfm/config.yaml
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# global settings 
+debug: false
+workspace: "paddlerec.models.rank.nfm"
+dataset:
+  - name: train_sample
+    type: QueueDataset
+    batch_size: 5
+    data_path: "{workspace}/data/sample_data/train"
+    sparse_slots: "label feat_idx"
+    dense_slots: "feat_value:39"
+  - name: infer_sample
+    type: QueueDataset
+    batch_size: 5
+    data_path: "{workspace}/data/sample_data/train"
+    sparse_slots: "label feat_idx"
+    dense_slots: "feat_value:39"
+hyper_parameters:
+    # user-defined configuration
+    optimizer:
+        class: Adam
+        learning_rate: 0.0001
+    sparse_feature_number: 1086460
+    sparse_feature_dim: 9
+    is_sparse: False
+    use_batchnorm: False
+    use_dropout: False
+    dropout_prob: 0.9
+    fc_sizes: [400, 400, 400]
+    loss_type: "log_loss" # log_loss or square_loss
+    reg: 0.001
+    num_field: 39
+    act: "relu"
+mode: train_runner
+# if infer, change mode to "infer_runner" and change phase to "infer_phase"
+runner:
+  - name: train_runner
+    trainer_class: single_train
+    epochs: 1
+    device: cpu
+    init_model_path: ""
+    save_checkpoint_interval: 1
+    save_inference_interval: 1
+    save_checkpoint_path: "increment"
+    save_inference_path: "inference"
+    print_interval: 1
+  - name: infer_runner
+    trainer_class: single_infer
+    epochs: 1
+    device: cpu
+    init_model_path: "increment/0"
+    print_interval: 1
+phase:
+- name: phase1
+  model: "{workspace}/model.py"
+  dataset_name: train_sample
+  thread_num: 1
+#- name: infer_phase
+#  model: "{workspace}/model.py"
+#  dataset_name: infer_sample
+#  thread_num: 1
--- a/models/rank/nfm/data/download.py
+++ b/models/rank/nfm/data/download.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+import io
+LOCAL_PATH = os.path.dirname(os.path.abspath(__file__))
+TOOLS_PATH = os.path.join(LOCAL_PATH, "..", "..", "tools")
+sys.path.append(TOOLS_PATH)
+from paddlerec.tools.tools import download_file_and_uncompress
+if __name__ == '__main__':
+    trainfile = 'train.txt'
+    url = "https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz"
+    print("download and extract starting...")
+    download_file_and_uncompress(url)
+    print("download and extract finished")
+    count = 0
+    for _ in io.open(trainfile, 'r', encoding='utf-8'):
+        count += 1
+    print("total records: %d" % count)
+    print("done")
--- a/models/rank/nfm/data/download_preprocess.py
+++ b/models/rank/nfm/data/download_preprocess.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import shutil
+import sys
+LOCAL_PATH = os.path.dirname(os.path.abspath(__file__))
+TOOLS_PATH = os.path.join(LOCAL_PATH, "..", "..", "tools")
+sys.path.append(TOOLS_PATH)
+from paddlerec.tools.tools import download_file_and_uncompress, download_file
+if __name__ == '__main__':
+    # Script entry point: download the raw archive and the feature dictionary,
+    # run preprocess.py, then delete the raw_data directory.
+    url = "https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz"
+    url2 = "https://paddlerec.bj.bcebos.com/deepfm%2Ffeat_dict_10.pkl2"
+    print("download and extract starting...")
+    download_file_and_uncompress(url)
+    # NOTE(review): the meaning of the third argument (True) is defined by
+    # paddlerec.tools.tools.download_file -- confirm there.
+    download_file(url2, "./sample_data/feat_dict_10.pkl2", True)
+    print("download and extract finished")
+    print("preprocessing...")
+    # Runs preprocess.py in a child process; its exit status is not checked.
+    os.system("python preprocess.py")
+    print("preprocess done")
+    shutil.rmtree("raw_data")
+    print("done")
--- a/models/rank/nfm/data/get_slot_data.py
+++ b/models/rank/nfm/data/get_slot_data.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import yaml, os
+from paddlerec.core.reader import Reader
+from paddlerec.core.utils import envs
+import paddle.fluid.incubate.data_generator as dg
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+class TrainReader(dg.MultiSlotDataGenerator):
+    def __init__(self, config):
+        dg.MultiSlotDataGenerator.__init__(self)
+        if os.path.isfile(config):
+            with open(config, 'r') as rb:
+                _config = yaml.load(rb.read(), Loader=yaml.FullLoader)
+        else:
+            raise ValueError("reader config only support yaml")
+    def init(self):
+        self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+        self.cont_max_ = [
+            5775, 257675, 65535, 969, 23159456, 431037, 56311, 6047, 29019, 46,
+            231, 4008, 7393
+        ]
+        self.cont_diff_ = [
+            self.cont_max_[i] - self.cont_min_[i]
+            for i in range(len(self.cont_min_))
+        ]
+        self.continuous_range_ = range(1, 14)
+        self.categorical_range_ = range(14, 40)
+        # load preprocessed feature dict 
+        self.feat_dict_name = "sample_data/feat_dict_10.pkl2"
+        self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb'))
+    def _process_line(self, line):
+        features = line.rstrip('\n').split('\t')
+        feat_idx = []
+        feat_value = []
+        for idx in self.continuous_range_:
+            if features[idx] == '':
+                feat_idx.append(0)
+                feat_value.append(0.0)
+            else:
+                feat_idx.append(self.feat_dict_[idx])
+                feat_value.append(
+                    (float(features[idx]) - self.cont_min_[idx - 1]) /
+                    self.cont_diff_[idx - 1])
+        for idx in self.categorical_range_:
+            if features[idx] == '' or features[idx] not in self.feat_dict_:
+                feat_idx.append(0)
+                feat_value.append(0.0)
+            else:
+                feat_idx.append(self.feat_dict_[features[idx]])
+                feat_value.append(1.0)
+        label = [int(features[0])]
+        return feat_idx, feat_value, label
+    def generate_sample(self, line):
+        """
+        Read the data line by line and process it as a dictionary
+        """
+        def data_iter():
+            feat_idx, feat_value, label = self._process_line(line)
+            s = ""
+            for i in [('feat_idx', feat_idx), ('feat_value', feat_value),
+                      ('label', label)]:
+                k = i[0]
+                v = i[1]
+                for j in v:
+                    s += " " + k + ":" + str(j)
+            print s.strip()
+            yield None
+        return data_iter
+# Module-level driver: build the reader from the model config, load the
+# normalization tables and feature dictionary via init(), then hand control
+# to run_from_stdin(). This runs whenever the module is executed or imported.
+reader = TrainReader("../config.yaml")
+reader.init()
+reader.run_from_stdin()
--- a/models/rank/nfm/data/preprocess.py
+++ b/models/rank/nfm/data/preprocess.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import numpy
+from collections import Counter
+import shutil
+import pickle
+def get_raw_data():
+    if not os.path.isdir('raw_data'):
+        os.mkdir('raw_data')
+    fin = open('train.txt', 'r')
+    fout = open('raw_data/part-0', 'w')
+    for line_idx, line in enumerate(fin):
+        if line_idx % 200000 == 0 and line_idx != 0:
+            fout.close()
+            cur_part_idx = int(line_idx / 200000)
+            fout = open('raw_data/part-' + str(cur_part_idx), 'w')
+        fout.write(line)
+    fout.close()
+    fin.close()
+def split_data():
+    split_rate_ = 0.9
+    dir_train_file_idx_ = 'aid_data/train_file_idx.txt'
+    filelist_ = [
+        'raw_data/part-%d' % x for x in range(len(os.listdir('raw_data')))
+    ]
+    if not os.path.exists(dir_train_file_idx_):
+        train_file_idx = list(
+            numpy.random.choice(
+                len(filelist_), int(len(filelist_) * split_rate_), False))
+        with open(dir_train_file_idx_, 'w') as fout:
+            fout.write(str(train_file_idx))
+    else:
+        with open(dir_train_file_idx_, 'r') as fin:
+            train_file_idx = eval(fin.read())
+    for idx in range(len(filelist_)):
+        if idx in train_file_idx:
+            shutil.move(filelist_[idx], 'train_data')
+        else:
+            shutil.move(filelist_[idx], 'test_data')
+def get_feat_dict():
+    freq_ = 10
+    dir_feat_dict_ = 'aid_data/feat_dict_' + str(freq_) + '.pkl2'
+    continuous_range_ = range(1, 14)
+    categorical_range_ = range(14, 40)
+    if not os.path.exists(dir_feat_dict_):
+        # print('generate a feature dict')
+        # Count the number of occurrences of discrete features
+        feat_cnt = Counter()
+        with open('train.txt', 'r') as fin:
+            for line_idx, line in enumerate(fin):
+                if line_idx % 100000 == 0:
+                    print('generating feature dict', line_idx / 45000000)
+                features = line.rstrip('\n').split('\t')
+                for idx in categorical_range_:
+                    if features[idx] == '': continue
+                    feat_cnt.update([features[idx]])
+        # Only retain discrete features with high frequency 
+        dis_feat_set = set()
+        for feat, ot in feat_cnt.items():
+            if ot >= freq_:
+                dis_feat_set.add(feat)
+        # Create a dictionary for continuous and discrete features
+        feat_dict = {}
+        tc = 1
+        # Continuous features
+        for idx in continuous_range_:
+            feat_dict[idx] = tc
+            tc += 1
+        for feat in dis_feat_set:
+            feat_dict[feat] = tc
+            tc += 1
+        # Save dictionary
+        with open(dir_feat_dict_, 'wb') as fout:
+            pickle.dump(feat_dict, fout, protocol=2)
+        print('args.num_feat ', len(feat_dict) + 1)
+if __name__ == '__main__':
+    if not os.path.isdir('train_data'):
+        os.mkdir('train_data')
+    if not os.path.isdir('test_data'):
+        os.mkdir('test_data')
+    if not os.path.isdir('aid_data'):
+        os.mkdir('aid_data')
+    get_raw_data()
+    split_data()
+    get_feat_dict()
+    print('Done!')
--- a/models/rank/nfm/data/run.sh
+++ b/models/rank/nfm/data/run.sh
+# Download and preprocess the raw data, then convert every file under
+# train_data/ and test_data/ into slot format with get_slot_data.py.
+python download_preprocess.py
+# -p keeps the script re-runnable when the output directory already exists.
+mkdir -p slot_train_data
+# Iterate with a glob instead of parsing `ls`, and quote every path so file
+# names with unusual characters are handled correctly.
+for f in ./train_data/*
+do
+    # An empty directory leaves the pattern unexpanded; skip that case.
+    [ -e "$f" ] || continue
+    i=$(basename "$f")
+    python get_slot_data.py < "train_data/$i" > "slot_train_data/$i"
+done
+mkdir -p slot_test_data
+for f in ./test_data/*
+do
+    [ -e "$f" ] || continue
+    i=$(basename "$f")
+    python get_slot_data.py < "test_data/$i" > "slot_test_data/$i"
+done
--- a/models/rank/nfm/data/sample_data/train/sample_train.txt
+++ b/models/rank/nfm/data/sample_data/train/sample_train.txt
--- a/models/rank/nfm/model.py
+++ b/models/rank/nfm/model.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from collections import OrderedDict
+import paddle.fluid as fluid
+from paddlerec.core.utils import envs
+from paddlerec.core.model import Model as ModelBase
+class Model(ModelBase):
+    def __init__(self, config):
+        ModelBase.__init__(self, config)
+    def _init_hyper_parameters(self):
+        self.is_distributed = True if envs.get_trainer(
+        ) == "CtrTrainer" else False
+        self.sparse_feature_number = envs.get_global_env(
+            "hyper_parameters.sparse_feature_number", None)
+        self.sparse_feature_dim = envs.get_global_env(
+            "hyper_parameters.sparse_feature_dim", None)
+        self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse",
+                                             False)
+        self.use_batchnorm = envs.get_global_env(
+            "hyper_parameters.use_batchnorm", False)
+        self.use_dropout = envs.get_global_env("hyper_parameters.use_dropout",
+                                               False)
+        self.dropout_prob = envs.get_global_env(
+            "hyper_parameters.dropout_prob", None)
+        self.layer_sizes = envs.get_global_env("hyper_parameters.fc_sizes",
+                                               None)
+        self.loss_type = envs.get_global_env("hyper_parameters.loss_type",
+                                             'logloss')
+        self.reg = envs.get_global_env("hyper_parameters.reg", 1e-4)
+        self.num_field = envs.get_global_env("hyper_parameters.num_field",
+                                             None)
+        self.act = envs.get_global_env("hyper_parameters.act", None)
+    def net(self, inputs, is_infer=False):
+        raw_feat_idx = self._sparse_data_var[1]  # (batch_size * num_field) * 1
+        raw_feat_value = self._dense_data_var[0]  # batch_size * num_field
+        self.label = self._sparse_data_var[0]  # batch_size * 1
+        init_value_ = 0.1
+        feat_idx = raw_feat_idx
+        feat_value = fluid.layers.reshape(
+            raw_feat_value,
+            [-1, self.num_field, 1])  # batch_size * num_field * 1
+        # ------------------------- first order term --------------------------
+        first_weights_re = fluid.embedding(
+            input=feat_idx,
+            is_sparse=self.is_sparse,
+            is_distributed=self.is_distributed,
+            dtype='float32',
+            size=[self.sparse_feature_number + 1, 1],
+            padding_idx=0,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.TruncatedNormalInitializer(
+                    loc=0.0, scale=init_value_),
+                regularizer=fluid.regularizer.L1DecayRegularizer(self.reg))
+        )  # (batch_size * num_field) * 1 * 1(embedding_size)
+        first_weights = fluid.layers.reshape(
+            first_weights_re,
+            shape=[-1, self.num_field, 1])  # batch_size * num_field * 1
+        y_first_order = fluid.layers.reduce_sum((first_weights * feat_value),
+                                                1)  # batch_size * 1
+        # ------------------------- second order term --------------------------
+        feat_embeddings_re = fluid.embedding(
+            input=feat_idx,
+            is_sparse=self.is_sparse,
+            is_distributed=self.is_distributed,
+            dtype='float32',
+            size=[self.sparse_feature_number + 1, self.sparse_feature_dim],
+            padding_idx=0,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.TruncatedNormalInitializer(
+                    loc=0.0,
+                    scale=init_value_ /
+                    math.sqrt(float(self.sparse_feature_dim))))
+        )  # (batch_size * num_field) * 1 * embedding_size
+        feat_embeddings = fluid.layers.reshape(
+            feat_embeddings_re,
+            shape=[-1, self.num_field, self.sparse_feature_dim
+                   ])  # batch_size * num_field * embedding_size
+        feat_embeddings = feat_embeddings * feat_value  # batch_size * num_field * embedding_size
+        # sum_square part
+        summed_features_emb = fluid.layers.reduce_sum(
+            feat_embeddings, 1)  # batch_size * embedding_size
+        summed_features_emb_square = fluid.layers.square(
+            summed_features_emb)  # batch_size * embedding_size
+        # square_sum part
+        squared_features_emb = fluid.layers.square(
+            feat_embeddings)  # batch_size * num_field * embedding_size
+        squared_sum_features_emb = fluid.layers.reduce_sum(
+            squared_features_emb, 1)  # batch_size * embedding_size
+        y_FM = 0.5 * (summed_features_emb_square - squared_sum_features_emb
+                      )  # batch_size * embedding_size
+        if self.use_batchnorm:
+            y_FM = fluid.layers.batch_norm(input=y_FM, is_test=is_infer)
+        if self.use_dropout:
+            y_FM = fluid.layers.dropout(
+                x=y_FM, dropout_prob=self.dropout_prob, is_test=is_infer)
+        # ------------------------- DNN --------------------------
+        y_dnn = y_FM
+        for s in self.layer_sizes:
+            if self.use_batchnorm:
+                y_dnn = fluid.layers.fc(
+                    input=y_dnn,
+                    size=s,
+                    act=self.act,
+                    param_attr=fluid.ParamAttr(initializer=fluid.initializer.
+                                               TruncatedNormalInitializer(
+                                                   loc=0.0,
+                                                   scale=init_value_ /
+                                                   math.sqrt(float(10)))),
+                    bias_attr=fluid.ParamAttr(initializer=fluid.initializer.
+                                              TruncatedNormalInitializer(
+                                                  loc=0.0, scale=init_value_)))
+                y_dnn = fluid.layers.batch_norm(
+                    input=y_dnn, act=self.act, is_test=is_infer)
+            else:
+                y_dnn = fluid.layers.fc(
+                    input=y_dnn,
+                    size=s,
+                    act=self.act,
+                    param_attr=fluid.ParamAttr(initializer=fluid.initializer.
+                                               TruncatedNormalInitializer(
+                                                   loc=0.0,
+                                                   scale=init_value_ /
+                                                   math.sqrt(float(10)))),
+                    bias_attr=fluid.ParamAttr(initializer=fluid.initializer.
+                                              TruncatedNormalInitializer(
+                                                  loc=0.0, scale=init_value_)))
+            if self.use_dropout:
+                y_dnn = fluid.layers.dropout(
+                    x=y_dnn, dropout_prob=self.dropout_prob, is_test=is_infer)
+        y_dnn = fluid.layers.fc(
+            input=y_dnn,
+            size=1,
+            act=None,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.TruncatedNormalInitializer(
+                    loc=0.0, scale=init_value_)),
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.TruncatedNormalInitializer(
+                    loc=0.0, scale=init_value_)))
+        # ------------------------- Predict --------------------------
+        self.predict = fluid.layers.sigmoid(y_first_order + y_dnn)
+        if self.loss_type == "squqre_loss":
+            cost = fluid.layers.mse_loss(
+                input=self.predict,
+                label=fluid.layers.cast(self.label, "float32"))
+        else:
+            cost = fluid.layers.log_loss(
+                input=self.predict,
+                label=fluid.layers.cast(self.label,
+                                        "float32"))  # default log_loss
+        avg_cost = fluid.layers.reduce_sum(cost)
+        self._cost = avg_cost
+        predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1)
+        label_int = fluid.layers.cast(self.label, 'int64')
+        auc_var, batch_auc_var, _ = fluid.layers.auc(input=predict_2d,
+                                                     label=label_int,
+                                                     slide_steps=0)
+        self._metrics["AUC"] = auc_var
+        self._metrics["BATCH_AUC"] = batch_auc_var
+        if is_infer:
+            self._infer_results["AUC"] = auc_var