From da1c712de3c7fc6e5a525ec8d512365effdd80cc Mon Sep 17 00:00:00 2001 From: malin10 Date: Thu, 9 Jul 2020 23:20:40 +0800 Subject: [PATCH] add linear regression --- models/rank/linear_regression/__init__.py | 13 ++ models/rank/linear_regression/config.yaml | 72 +++++++++ .../data/download_preprocess.py | 37 +++++ .../rank/linear_regression/data/preprocess.py | 146 ++++++++++++++++++ models/rank/linear_regression/data/split.py | 56 +++++++ .../linear_regression/data/test_data/data | 0 .../linear_regression/data/train_data/data | 0 models/rank/linear_regression/data_prepare.sh | 15 ++ models/rank/linear_regression/model.py | 75 +++++++++ models/rank/linear_regression/parse_param.py | 64 ++++++++ 10 files changed, 478 insertions(+) create mode 100644 models/rank/linear_regression/__init__.py create mode 100644 models/rank/linear_regression/config.yaml create mode 100644 models/rank/linear_regression/data/download_preprocess.py create mode 100644 models/rank/linear_regression/data/preprocess.py create mode 100644 models/rank/linear_regression/data/split.py create mode 100644 models/rank/linear_regression/data/test_data/data create mode 100644 models/rank/linear_regression/data/train_data/data create mode 100644 models/rank/linear_regression/data_prepare.sh create mode 100644 models/rank/linear_regression/model.py create mode 100644 models/rank/linear_regression/parse_param.py diff --git a/models/rank/linear_regression/__init__.py b/models/rank/linear_regression/__init__.py new file mode 100644 index 00000000..abf198b9 --- /dev/null +++ b/models/rank/linear_regression/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/models/rank/linear_regression/config.yaml b/models/rank/linear_regression/config.yaml new file mode 100644 index 00000000..29e81409 --- /dev/null +++ b/models/rank/linear_regression/config.yaml @@ -0,0 +1,72 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# global settings +debug: false +workspace: "/home/aistudio/PaddleRec-master/models/rank/linear_regression" + + +dataset: + - name: dataset_train + type: QueueDataset + batch_size: 1 + data_path: "{workspace}/data/train_data/" + sparse_slots: "userid gender age occupation movieid title genres" + dense_slots: "label:1" + - name: dataset_infer + type: QueueDataset + batch_size: 1 + data_path: "{workspace}/data/test_data/" + sparse_slots: "userid gender age occupation movieid title genres" + dense_slots: "label:1" + +hyper_parameters: + optimizer: + class: SGD + learning_rate: 0.0001 + sparse_feature_number: 1000000 + sparse_feature_dim: 1 + reg: 0.001 + + +mode: train_runner +# To run inference, set mode to "infer_runner" and replace phase1 with the commented-out "infer_phase" below + +runner: + - name: train_runner + class: train + epochs: 1 + device: cpu + init_model_path: "" + save_checkpoint_interval: 1 + save_inference_interval: 1 + save_checkpoint_path: "increment" + save_inference_path: "inference" + print_interval: 100 + - name: infer_runner + class: infer + device: cpu + init_model_path: "increment/0" + print_interval: 1 + + +phase: +- name: phase1 + model: "{workspace}/model.py" + dataset_name: dataset_train + thread_num: 12 +#- name: infer_phase +# model: "{workspace}/model.py" +# dataset_name: dataset_infer +# thread_num: 1 diff --git a/models/rank/linear_regression/data/download_preprocess.py b/models/rank/linear_regression/data/download_preprocess.py new file mode 100644 index 00000000..ab2f7cc6 --- /dev/null +++ b/models/rank/linear_regression/data/download_preprocess.py @@ -0,0 +1,37 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fetch the MovieLens-1M archive for the linear_regression example.

Run as a script.  The actual transfer and extraction are delegated to
``paddlerec.tools.tools.download_file_and_uncompress``; where the archive is
unpacked is decided by that helper (presumably the current working
directory -- confirm against paddlerec).
"""

import os
import shutil
import sys

LOCAL_PATH = os.path.dirname(os.path.abspath(__file__))
TOOLS_PATH = os.path.join(LOCAL_PATH, "..", "..", "tools")
# Extends the import search path with a "tools" directory two levels above
# this file.  NOTE(review): the import below goes through the installed
# ``paddlerec`` package, so this entry may be unnecessary -- confirm.
sys.path.append(TOOLS_PATH)

from paddlerec.tools.tools import download_file_and_uncompress, download_file

MOVIELENS_1M_URL = "http://files.grouplens.org/datasets/movielens/ml-1m.zip"


def main():
    """Download and extract the dataset, reporting progress on stdout."""
    print("download and extract starting...")
    download_file_and_uncompress(MOVIELENS_1M_URL)
    print("download and extract finished")

    # Preprocessing and clean-up of the raw download are intentionally not
    # performed here; data_prepare.sh drives split.py / preprocess.py.
    print("done")


if __name__ == '__main__':
    main()

# ---- patch continues: new file models/rank/linear_regression/data/preprocess.py ----
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
+ +#coding=utf8 +import os +import sys +reload(sys) +sys.setdefaultencoding('utf-8') +import random +import json + +user_fea = ["userid", "gender", "age", "occupation"] +movie_fea = ["movieid", "title", "genres"] +rating_fea = ["userid", "movieid", "rating", "time"] +dict_size = 1000000 +hash_dict = dict() + +data_path = "ml-1m" +test_user_path = "online_user" + + +def process(path, output_path): + user_dict = parse_data(data_path + "/users.dat", user_fea) + movie_dict = parse_movie_data(data_path + "/movies.dat", movie_fea) + + res = [] + for line in open(path): + line = line.strip() + arr = line.split("::") + userid = arr[0] + movieid = arr[1] + out_str = "time:%s\t%s\t%s\tlabel:%s" % (arr[3], user_dict[userid], + movie_dict[movieid], arr[2]) + log_id = hash(out_str) % 1000000000 + res.append("%s\t%s" % (log_id, out_str)) + with open(output_path, 'w') as fout: + for line in res: + fout.write(line) + fout.write("\n") + + +def parse_data(file_name, feas): + dict = {} + for line in open(file_name): + line = line.strip() + arr = line.split("::") + out_str = "" + for i in range(0, len(feas)): + out_str += "%s:%s\t" % (feas[i], arr[i]) + + dict[arr[0]] = out_str.strip() + return dict + + +def parse_movie_data(file_name, feas): + dict = {} + for line in open(file_name): + line = line.strip() + arr = line.split("::") + title_str = "" + genres_str = "" + + for term in arr[1].split(" "): + term = term.strip() + if term != "": + title_str += "%s " % (term) + for term in arr[2].split("|"): + term = term.strip() + if term != "": + genres_str += "%s " % (term) + out_str = "movieid:%s\ttitle:%s\tgenres:%s" % ( + arr[0], title_str.strip(), genres_str.strip()) + dict[arr[0]] = out_str.strip() + return dict + + +def to_hash(in_str): + feas = in_str.split(":")[0] + arr = in_str.split(":")[1] + out_str = "%s:%s" % (feas, (arr + arr[::-1] + arr[::-2] + arr[::-3])) + hash_id = hash(out_str) % dict_size + # if hash_id in hash_dict and hash_dict[hash_id] != out_str: + # print(hash_id, 
out_str, hash(out_str)) + # print("conflict") + # exit(-1) + + return "%s:%s" % (feas, hash_id) + + +def to_hash_list(in_str): + arr = in_str.split(":") + tmp_arr = arr[1].split(" ") + out_str = "" + for item in tmp_arr: + item = item.strip() + if item != "": + key = "%s:%s" % (arr[0], item) + out_str += "%s " % (to_hash(key)) + return out_str.strip() + + +def get_hash(path): + #0-34831 1-time:974673057 2-userid:2021 3-gender:M 4-age:25 5-occupation:0 6-movieid:1345 7-title:Carrie (1976) 8-genres:Horror 9-label:2 + for line in open(path): + arr = line.strip().split("\t") + out_str = "logid:%s %s %s %s %s %s %s %s %s %s" % \ + (arr[0], arr[1], to_hash(arr[2]), to_hash(arr[3]), to_hash(arr[4]), to_hash(arr[5]), \ + to_hash(arr[6]), to_hash_list(arr[7]), to_hash_list(arr[8]), arr[9]) + print out_str + + +def split(path, output_dir, num=24): + contents = [] + with open(path) as f: + contents = f.readlines() + lines_per_file = len(contents) / num + print("contents: ", str(len(contents))) + print("lines_per_file: ", str(lines_per_file)) + + for i in range(1, num + 1): + with open(os.path.join(output_dir, "part_" + str(i)), 'w') as fout: + data = contents[(i - 1) * lines_per_file:min(i * lines_per_file, + len(contents))] + for line in data: + fout.write(line) + + +if __name__ == "__main__": + random.seed(1111111) + if sys.argv[1] == "process_raw": + process(sys.argv[2], sys.argv[3]) + elif sys.argv[1] == "hash": + get_hash(sys.argv[2]) + elif sys.argv[1] == "split": + split(sys.argv[2], sys.argv[3]) diff --git a/models/rank/linear_regression/data/split.py b/models/rank/linear_regression/data/split.py new file mode 100644 index 00000000..c763faf3 --- /dev/null +++ b/models/rank/linear_regression/data/split.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Split MovieLens ratings into a training sample and a per-user test set.

Run as a script it reads ``<data_path>/ratings.dat`` and writes
``train.dat`` (a shuffled sample of at most ``train_num`` lines) and
``test.dat`` (one held-out rating per user) next to it.
"""

import random

data_path = "ml-1m"
train_num = 10000


def split_ratings(lines):
    """Partition rating lines into ``(train_lines, test_lines)``.

    Lines have the form ``userid::movieid::rating::time``.  Per user:

    * the first rating seen always goes to the training set, so every user
      has at least one training row;
    * among the remaining ratings the one with the largest timestamp is held
      out for testing (on ties the earlier line stays held out);
    * everything else goes to the training set.

    A user with a single rating therefore contributes no test line.  Blank
    lines are ignored.  Lines are returned unchanged, grouped by user in
    order of first appearance.

    Raises:
        IndexError / ValueError: a non-first rating of a user has no valid
            integer timestamp in its fourth field.
    """
    train = {}
    held_out = {}  # userid -> (timestamp, line)
    for line in lines:
        stripped = line.rstrip()
        if not stripped:
            continue
        fea = stripped.split("::")
        user = fea[0]
        if user not in train:
            train[user] = [line]
            continue
        timestamp = int(fea[3])
        current = held_out.get(user)
        if current is None:
            held_out[user] = (timestamp, line)
        elif timestamp <= current[0]:
            train[user].append(line)
        else:
            # A newer rating replaces the held-out one, which moves to train.
            train[user].append(current[1])
            held_out[user] = (timestamp, line)

    train_lines = [row for rows in train.values() for row in rows]
    test_lines = [row for _, row in held_out.values()]
    return train_lines, test_lines


def main(data_dir=data_path, max_train=train_num, seed=None):
    """Create ``train.dat`` and ``test.dat`` inside *data_dir*.

    Args:
        data_dir: directory containing ``ratings.dat``.
        max_train: maximum number of shuffled training lines to keep.
        seed: optional seed for a reproducible shuffle; ``None`` keeps the
            unseeded behaviour of the global ``random`` module.
    """
    with open(data_dir + "/ratings.dat") as f:
        train_data, test_data = split_ratings(f)

    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(train_data)

    with open(data_dir + "/train.dat", 'w') as f:
        f.writelines(train_data[:max_train])

    with open(data_dir + "/test.dat", 'w') as f:
        f.writelines(test_data)


if __name__ == "__main__":
    main()

# ---- patch continues (text that shared this line of the collapsed patch) ----
# models/rank/linear_regression/data/test_data/data   : new empty file
# models/rank/linear_regression/data/train_data/data  : new empty file
# models/rank/linear_regression/data_prepare.sh (new file) begins:
#     cd data
#     # 1. download data
#     python download_preprocess.py
#
#     # 2. split data
#     python split.py
#
#     # 3. (step description continues on the following patch line)
join user and movie features onto each rating +python preprocess.py process_raw ml-1m/train.dat raw_train +python preprocess.py process_raw ml-1m/test.dat raw_test + +# 4. hash +python preprocess.py hash raw_train > train_data/data +python preprocess.py hash raw_test > test_data/data +cd .. diff --git a/models/rank/linear_regression/model.py b/models/rank/linear_regression/model.py new file mode 100644 index 00000000..1680092a --- /dev/null +++ b/models/rank/linear_regression/model.py @@ -0,0 +1,75 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 

import math

import paddle.fluid as fluid

from paddlerec.core.utils import envs
from paddlerec.core.model import ModelBase


class Model(ModelBase):
    """Linear model over hashed sparse features.

    Every sparse id owns one scalar weight (an embedding of width 1).  The
    prediction is ``relu(sum of looked-up weights + bias)`` and the training
    cost is the summed squared error against the first dense input.
    """

    def __init__(self, config):
        ModelBase.__init__(self, config)

    def _init_hyper_parameters(self):
        """Read ``hyper_parameters.reg`` (default 1e-4) and
        ``hyper_parameters.sparse_feature_number`` (default None) from the
        global environment."""
        self.reg = envs.get_global_env("hyper_parameters.reg", 1e-4)
        self.sparse_feature_number = envs.get_global_env(
            "hyper_parameters.sparse_feature_number", None)

    def net(self, inputs, is_infer=False):
        """Build the network and register cost, metrics and infer results.

        Args:
            inputs: not read here; the input variables are taken from
                ``self._sparse_data_var`` / ``self._dense_data_var``
                (presumably populated by ModelBase from the dataset slots --
                confirm).
            is_infer: when true, the prediction and cost are also exposed
                through ``self._infer_results``.
        """
        weight_init_scale = 0.1
        distributed_lookup = envs.get_trainer() == "CtrTrainer"

        # ------------------------- network input --------------------------
        sparse_slots = self._sparse_data_var
        self.label = self._dense_data_var[0]

        def lookup_slot_weights(slot_ids):
            """Return the scalar weights of one slot's ids as an [N, 1] tensor."""
            weights = fluid.embedding(
                input=slot_ids,
                is_sparse=True,
                is_distributed=distributed_lookup,
                size=[self.sparse_feature_number + 1, 1],
                padding_idx=0,
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.TruncatedNormalInitializer(
                        loc=0.0, scale=weight_init_scale),
                    regularizer=fluid.regularizer.L1DecayRegularizer(
                        self.reg)))
            return fluid.layers.reshape(weights, shape=[-1, 1])

        # One lookup per slot, created in slot order.
        per_slot_weights = [lookup_slot_weights(slot) for slot in sparse_slots]

        # All weights are stacked along axis 0 and summed without a `dim`
        # argument.  NOTE(review): this looks like it also sums across
        # samples when batch_size > 1 -- confirm the intended batch size.
        stacked = fluid.layers.concat(per_slot_weights, axis=0)
        weight_sum = fluid.layers.reduce_sum(stacked)

        b_linear = fluid.layers.create_parameter(
            shape=[1],
            dtype='float32',
            default_initializer=fluid.initializer.ConstantInitializer(value=0))

        self.predict = fluid.layers.relu(weight_sum + b_linear)

        squared_error = fluid.layers.square_error_cost(
            input=self.predict, label=self.label)
        # The cost is a sum, not a mean, of the squared errors.
        self._cost = fluid.layers.reduce_sum(squared_error)

        self._metrics["COST"] = self._cost
        self._metrics["Predict"] = self.predict
        if is_infer:
            self._infer_results["Predict"] = self.predict
            self._infer_results["COST"] = self._cost

# ---- patch continues: new file models/rank/linear_regression/parse_param.py ----
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pair each sample's features with the weights the model looked up for them.

Usage::

    python parse_param.py <params dump> <raw feature file> <score dump>

* params dump: one line per sample, ``data: w1,w2,...,`` (the ``data:`` prefix
  and the trailing comma are optional);
* raw feature file: tab-separated lines whose first two columns are skipped,
  whose third- and second-to-last columns are multi-word ``name:w1 w2 ...``
  fields and whose last column is the label;
* score dump: one line per sample, ``data: [score]``.

For every sample one line is printed: ``feature_weight`` pairs in order of
decreasing weight, then the label, then the score, joined by ``"; "``.
"""

import numpy as np
import sys

_DUMP_PREFIX = "data:"


def _strip_dump_prefix(line):
    """Return *line* without surrounding whitespace and a leading ``data:``."""
    text = line.strip()
    if text.startswith(_DUMP_PREFIX):
        text = text[len(_DUMP_PREFIX):]
    return text.strip()


def parse_param_line(line):
    """Parse ``data: w1,w2,...,`` into a list of floats.

    A real list is returned (not a lazy ``map``) because the values are
    indexed, summed and measured later on.
    """
    body = _strip_dump_prefix(line).strip(",")
    return [float(value) for value in body.split(",")]


def parse_score_line(line):
    """Parse ``data: [score]`` into a float (drops one bracket on each side)."""
    return float(_strip_dump_prefix(line)[1:-1])


def expand_features(fields):
    """Return the ``name:value`` tokens of one raw sample, in lookup order.

    ``fields[2:-3]`` are taken as they are; the third- and second-to-last
    fields are expanded into one ``name:word`` token per non-empty
    space-separated word.  Only the text between the first and second ``:``
    of such a field is considered.
    """
    tokens = list(fields[2:-3])
    for multi_valued in (fields[-3], fields[-2]):
        parts = multi_valued.split(":")
        tokens.extend(
            parts[0] + ":" + word for word in parts[1].split(" ")
            if word != "")
    return tokens


def format_sample(fields, weights, score):
    """Build the report line for one sample.

    Args:
        fields: tab-split columns of the raw feature line.
        weights: list of floats, one per token of ``expand_features(fields)``.
        score: model output for the sample.

    Raises:
        ValueError: the number of weights differs from the number of tokens.
    """
    tokens = expand_features(fields)
    # Checked before indexing so a mismatch is reported as such instead of
    # surfacing as an IndexError (or silently dropping features).
    if len(weights) != len(tokens):
        raise ValueError("expected %d weights for the sample, got %d" %
                         (len(tokens), len(weights)))
    order = np.argsort(np.array(weights))[::-1]
    res = [tokens[j] + "_" + str(weights[j]) for j in order]
    res.append(fields[-1])
    res.append(str(score))
    return "; ".join(res)


def main(argv=None):
    """Read the three input files and print one report line per sample.

    Returns:
        Process exit status: 0 on success, 2 on a usage error.

    Raises:
        ValueError: the three files do not describe the same number of
            samples, or a sample's weight count does not match its features.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 4:
        sys.stderr.write(
            "usage: parse_param.py <params dump> <raw feature file> "
            "<score dump>\n")
        return 2

    with open(argv[1]) as f:
        params = [parse_param_line(line) for line in f]
    with open(argv[2]) as f:
        feas = [line.strip().split('\t') for line in f]
    with open(argv[3]) as f:
        score = [parse_score_line(line) for line in f]

    if not len(params) == len(feas) == len(score):
        raise ValueError(
            "sample count mismatch: %d params, %d feature lines, %d scores" %
            (len(params), len(feas), len(score)))

    for fields, weights, value in zip(feas, params, score):
        print(format_sample(fields, weights, value))
    return 0


if __name__ == "__main__":
    sys.exit(main())

# -- GitLab (patch trailer)