Commit da1c712d authored by malin10

add linear regression

Parent: b8e17866
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# global settings
debug: false
workspace: "/home/aistudio/PaddleRec-master/models/rank/linear_regression"

dataset:
- name: dataset_train
  type: QueueDataset
  batch_size: 1
  data_path: "{workspace}/data/train_data/"
  sparse_slots: "userid gender age occupation movieid title genres"
  dense_slots: "label:1"
- name: dataset_infer
  type: QueueDataset
  batch_size: 1
  data_path: "{workspace}/data/test_data/"
  sparse_slots: "userid gender age occupation movieid title genres"
  dense_slots: "label:1"

hyper_parameters:
  optimizer:
    class: SGD
    learning_rate: 0.0001
  sparse_feature_number: 1000000
  sparse_feature_dim: 1
  reg: 0.001

mode: train_runner
# if infer, change mode to "infer_runner" and change phase to "infer_phase"

runner:
- name: train_runner
  class: train
  epochs: 1
  device: cpu
  init_model_path: ""
  save_checkpoint_interval: 1
  save_inference_interval: 1
  save_checkpoint_path: "increment"
  save_inference_path: "inference"
  print_interval: 100
- name: infer_runner
  class: infer
  device: cpu
  init_model_path: "increment/0"
  print_interval: 1

phase:
- name: phase1
  model: "{workspace}/model.py"
  dataset_name: dataset_train
  thread_num: 12
#- name: infer_phase
#  model: "{workspace}/model.py"
#  dataset_name: dataset_infer
#  thread_num: 1
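For reference, each name in sparse_slots / dense_slots above corresponds to a "slot:value" token in the data files that the preprocessing steps below produce. A minimal parsing sketch (not part of the commit; the sample line and its ids are illustrative only):

# Sketch: how one slot-formatted line maps onto the slots declared above.
sample = ("logid:123 time:974673057 userid:8731 gender:4421 age:907 "
          "occupation:66 movieid:5120 title:771 title:9902 genres:3344 label:2")

sparse_slots = "userid gender age occupation movieid title genres".split()
dense_slots = ["label"]

sparse, dense = {}, {}
for token in sample.split():
    slot, _, value = token.partition(":")
    if slot in sparse_slots:
        sparse.setdefault(slot, []).append(int(value))  # hashed feature ids
    elif slot in dense_slots:
        dense[slot] = float(value)                      # the rating label

print(sparse)  # {'userid': [8731], ..., 'title': [771, 9902], ...}
print(dense)   # {'label': 2.0}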
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
import sys

LOCAL_PATH = os.path.dirname(os.path.abspath(__file__))
TOOLS_PATH = os.path.join(LOCAL_PATH, "..", "..", "tools")
sys.path.append(TOOLS_PATH)

from paddlerec.tools.tools import download_file_and_uncompress, download_file

if __name__ == '__main__':
    url = "http://files.grouplens.org/datasets/movielens/ml-1m.zip"
    print("download and extract starting...")
    download_file_and_uncompress(url)
    print("download and extract finished")
    # print("preprocessing...")
    # os.system("python preprocess.py")
    # print("preprocess done")
    # shutil.rmtree("raw_data")
    print("done")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# coding=utf-8
import os
import sys
import random
import json

# Feature field names for each MovieLens-1M .dat file.
user_fea = ["userid", "gender", "age", "occupation"]
movie_fea = ["movieid", "title", "genres"]
rating_fea = ["userid", "movieid", "rating", "time"]

dict_size = 1000000
hash_dict = dict()

data_path = "ml-1m"
test_user_path = "online_user"


def process(path, output_path):
    # Join each rating record with its user and movie feature strings.
    user_dict = parse_data(data_path + "/users.dat", user_fea)
    movie_dict = parse_movie_data(data_path + "/movies.dat", movie_fea)

    res = []
    for line in open(path):
        line = line.strip()
        arr = line.split("::")
        userid = arr[0]
        movieid = arr[1]
        out_str = "time:%s\t%s\t%s\tlabel:%s" % (arr[3], user_dict[userid],
                                                 movie_dict[movieid], arr[2])
        log_id = hash(out_str) % 1000000000
        res.append("%s\t%s" % (log_id, out_str))

    with open(output_path, 'w') as fout:
        for line in res:
            fout.write(line)
            fout.write("\n")


def parse_data(file_name, feas):
    # Build {id: "fea0:v0\tfea1:v1\t..."} from a "::"-separated .dat file.
    res = {}
    for line in open(file_name):
        line = line.strip()
        arr = line.split("::")
        out_str = ""
        for i in range(0, len(feas)):
            out_str += "%s:%s\t" % (feas[i], arr[i])
        res[arr[0]] = out_str.strip()
    return res


def parse_movie_data(file_name, feas):
    # Movies need special handling: the title is space-separated terms and
    # the genres are "|"-separated.
    res = {}
    for line in open(file_name):
        line = line.strip()
        arr = line.split("::")
        title_str = ""
        genres_str = ""
        for term in arr[1].split(" "):
            term = term.strip()
            if term != "":
                title_str += "%s " % (term)
        for term in arr[2].split("|"):
            term = term.strip()
            if term != "":
                genres_str += "%s " % (term)
        out_str = "movieid:%s\ttitle:%s\tgenres:%s" % (
            arr[0], title_str.strip(), genres_str.strip())
        res[arr[0]] = out_str.strip()
    return res


def to_hash(in_str):
    # Map "slot:value" to "slot:id" with id in [0, dict_size). The value is
    # concatenated with reversed/strided copies of itself before hashing to
    # spread out similar strings. Note: Python 3 randomizes str hashes per
    # process, so fix PYTHONHASHSEED if reproducible ids are required.
    feas = in_str.split(":")[0]
    arr = in_str.split(":")[1]
    out_str = "%s:%s" % (feas, (arr + arr[::-1] + arr[::-2] + arr[::-3]))
    hash_id = hash(out_str) % dict_size
    # if hash_id in hash_dict and hash_dict[hash_id] != out_str:
    #     print(hash_id, out_str, hash(out_str))
    #     print("conflict")
    #     exit(-1)
    return "%s:%s" % (feas, hash_id)


def to_hash_list(in_str):
    # Hash each space-separated value of a multi-valued slot separately.
    arr = in_str.split(":")
    tmp_arr = arr[1].split(" ")
    out_str = ""
    for item in tmp_arr:
        item = item.strip()
        if item != "":
            key = "%s:%s" % (arr[0], item)
            out_str += "%s " % (to_hash(key))
    return out_str.strip()


def get_hash(path):
    # Input columns:
    # 0-logid 1-time 2-userid 3-gender 4-age 5-occupation 6-movieid 7-title 8-genres 9-label
    # e.g. 0-34831 1-time:974673057 2-userid:2021 3-gender:M 4-age:25
    #      5-occupation:0 6-movieid:1345 7-title:Carrie (1976) 8-genres:Horror 9-label:2
    for line in open(path):
        arr = line.strip().split("\t")
        out_str = "logid:%s %s %s %s %s %s %s %s %s %s" % \
            (arr[0], arr[1], to_hash(arr[2]), to_hash(arr[3]), to_hash(arr[4]), to_hash(arr[5]),
             to_hash(arr[6]), to_hash_list(arr[7]), to_hash_list(arr[8]), arr[9])
        print(out_str)


def split(path, output_dir, num=24):
    # Split one file into `num` roughly equal parts named part_1 ... part_num;
    # any remainder beyond num * lines_per_file is dropped.
    contents = []
    with open(path) as f:
        contents = f.readlines()
    lines_per_file = len(contents) // num
    print("contents: ", str(len(contents)))
    print("lines_per_file: ", str(lines_per_file))
    for i in range(1, num + 1):
        with open(os.path.join(output_dir, "part_" + str(i)), 'w') as fout:
            data = contents[(i - 1) * lines_per_file:min(i * lines_per_file,
                                                         len(contents))]
            for line in data:
                fout.write(line)


if __name__ == "__main__":
    random.seed(1111111)
    if sys.argv[1] == "process_raw":
        process(sys.argv[2], sys.argv[3])
    elif sys.argv[1] == "hash":
        get_hash(sys.argv[2])
    elif sys.argv[1] == "split":
        split(sys.argv[2], sys.argv[3])
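To make the hashing step concrete, a minimal usage sketch (not part of the commit; it assumes the file above is saved as preprocess.py, and the exact ids vary across runs unless PYTHONHASHSEED is fixed):

from preprocess import to_hash, to_hash_list

# Single-valued slot: one "slot:id" pair.
print(to_hash("userid:2021"))               # e.g. "userid:483920"
# Multi-valued slot: each term is hashed separately.
print(to_hash_list("genres:Horror Drama"))  # e.g. "genres:91022 genres:5310"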
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random

# Leave-the-latest-out split: for every user, the chronologically last
# rating is held out for the test set and the rest go to the train pool.
train = dict()
test = dict()
data_path = "ml-1m"

for line in open(data_path + "/ratings.dat"):
    fea = line.rstrip().split("::")
    if fea[0] not in train:
        # First rating seen for this user: start the train list.
        train[fea[0]] = [line]
    elif fea[0] not in test:
        # Second rating: tentatively hold it out as the test candidate.
        test[fea[0]] = dict()
        test[fea[0]]['time'] = int(fea[3])
        test[fea[0]]['content'] = line
    else:
        # Later ratings: keep the newest as the test candidate and push
        # the displaced candidate back into the train list.
        time = int(fea[3])
        if time <= test[fea[0]]['time']:
            train[fea[0]].append(line)
        else:
            train[fea[0]].append(test[fea[0]]['content'])
            test[fea[0]]['time'] = time
            test[fea[0]]['content'] = line

train_data = []
for key in train:
    for line in train[key]:
        train_data.append(line)
random.shuffle(train_data)

# Keep only the first 10000 shuffled lines for training.
train_num = 10000
idx = 0
with open(data_path + "/train.dat", 'w') as f:
    for line in train_data:
        idx += 1
        if idx > train_num:
            break
        f.write(line)

with open(data_path + "/test.dat", 'w') as f:
    for key in test:
        f.write(test[key]['content'])
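A toy run of the split rule above (a minimal sketch with made-up ratings, not part of the commit): u1 has three ratings, so the newest lands in test and the other two stay in train.

# Toy illustration of the leave-the-latest-out rule.
ratings = [
    "u1::m1::5::100",  # oldest
    "u1::m2::4::300",  # newest -> becomes the test record
    "u1::m3::3::200",
]
train, test = {}, {}
for line in ratings:
    fea = line.split("::")
    uid, ts = fea[0], int(fea[3])
    if uid not in train:
        train[uid] = [line]
    elif uid not in test:
        test[uid] = {"time": ts, "content": line}
    elif ts <= test[uid]["time"]:
        train[uid].append(line)
    else:
        train[uid].append(test[uid]["content"])
        test[uid] = {"time": ts, "content": line}

print(train)  # {'u1': ['u1::m1::5::100', 'u1::m3::3::200']}
print(test)   # {'u1': {'time': 300, 'content': 'u1::m2::4::300'}}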
cd data
# 1. download data
python download_preprocess.py
# 2. split data
python split.py
# 3. join data (merge user and movie features into each rating record)
python preprocess.py process_raw ml-1m/train.dat raw_train
python preprocess.py process_raw ml-1m/test.dat raw_test
# 4. hash
python preprocess.py hash raw_train > train_data/data
python preprocess.py hash raw_test > test_data/data
cd ..
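Note: the redirects in step 4 assume train_data/ and test_data/ already exist inside data/; if they are not present in your checkout, create them first (for example with mkdir -p train_data test_data) before running the hash step.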
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

import paddle.fluid as fluid

from paddlerec.core.utils import envs
from paddlerec.core.model import ModelBase


class Model(ModelBase):
    def __init__(self, config):
        ModelBase.__init__(self, config)

    def _init_hyper_parameters(self):
        self.sparse_feature_number = envs.get_global_env(
            "hyper_parameters.sparse_feature_number", None)
        self.reg = envs.get_global_env("hyper_parameters.reg", 1e-4)

    def net(self, inputs, is_infer=False):
        init_value_ = 0.1
        is_distributed = True if envs.get_trainer() == "CtrTrainer" else False

        # ------------------------- network input --------------------------
        sparse_var = self._sparse_data_var
        self.label = self._dense_data_var[0]

        def embedding_layer(input):
            # One scalar weight per hashed feature id (embedding dim = 1),
            # L1-regularized, with id 0 reserved as padding.
            emb = fluid.embedding(
                input=input,
                is_sparse=True,
                is_distributed=is_distributed,
                size=[self.sparse_feature_number + 1, 1],
                padding_idx=0,
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.TruncatedNormalInitializer(
                        loc=0.0, scale=init_value_),
                    regularizer=fluid.regularizer.L1DecayRegularizer(
                        self.reg)))
            reshape_emb = fluid.layers.reshape(emb, shape=[-1, 1])
            return reshape_emb

        # Linear regression: prediction = relu(sum of feature weights + bias).
        sparse_embed_seq = list(map(embedding_layer, sparse_var))
        weight = fluid.layers.concat(sparse_embed_seq, axis=0)
        weight_sum = fluid.layers.reduce_sum(weight)

        b_linear = fluid.layers.create_parameter(
            shape=[1],
            dtype='float32',
            default_initializer=fluid.initializer.ConstantInitializer(value=0))

        self.predict = fluid.layers.relu(weight_sum + b_linear)

        # Squared-error loss, summed over the batch.
        cost = fluid.layers.square_error_cost(
            input=self.predict, label=self.label)
        avg_cost = fluid.layers.reduce_sum(cost)

        self._cost = avg_cost
        self._metrics["COST"] = self._cost
        self._metrics["Predict"] = self.predict
        if is_infer:
            self._infer_results["Predict"] = self.predict
            self._infer_results["COST"] = self._cost
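What net() computes, stripped of the fluid graph machinery: each hashed feature id indexes a scalar weight, the active weights are summed with a bias, and a ReLU clamps the prediction before the squared-error loss. A minimal NumPy paraphrase (not the commit's code; plain normal init stands in for fluid's truncated normal, and the ids and values are illustrative):

import numpy as np

rng = np.random.default_rng(0)
dict_size = 1000000

# One scalar weight per hashed feature id (the dim-1 embedding table) plus a bias.
w = rng.normal(0.0, 0.1, size=dict_size + 1).astype(np.float32)
b = np.float32(0.0)

def predict(feature_ids):
    # relu(sum of active feature weights + bias) == the model's self.predict
    return max(0.0, float(w[feature_ids].sum() + b))

# Hashed ids for one sample's slots (illustrative values).
ids = np.array([8731, 4421, 907, 66, 5120, 771, 3344])
pred = predict(ids)
label = 2.0
cost = (pred - label) ** 2  # square_error_cost for this single sample
print(pred, cost)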
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys

import numpy as np

# Read per-sample feature weights (argv[1]), the joined feature lines
# (argv[2]) and the dumped predictions (argv[3]), then print each sample's
# features sorted by learned weight, followed by its label and score.

params = []
with open(sys.argv[1]) as f:
    for line in f:
        # strip() with a character set trims the "data: " prefix and the
        # trailing comma around each dumped tensor.
        line = line.strip().strip('data: ').strip(',').split(',')
        line = list(map(float, line))
        params.append(line)

feas = []
with open(sys.argv[2]) as f:
    for line in f:
        line = line.strip().split('\t')
        feas.append(line)

score = []
with open(sys.argv[3]) as f:
    for line in f:
        # [1:-1] drops the brackets around a single dumped value, e.g. "[2.41]".
        line = float(line.strip().strip('data: ').strip()[1:-1])
        score.append(line)

assert (len(params) == len(feas))
length = len(params)
bias = None
for i in range(length):
    label = feas[i][-1]
    # Single-valued slots sit between the logid/time prefix and the two
    # multi-valued slots (title, genres) that precede the label.
    tmp = feas[i][2:-3]
    tmp_fea = feas[i][-3].split(":")
    _ = tmp_fea[1].split(" ")
    for j in range(len(_)):
        if _[j] != "":
            tmp.append(tmp_fea[0] + ":" + _[j])
    tmp_fea = feas[i][-2].split(":")
    _ = tmp_fea[1].split(" ")
    for j in range(len(_)):
        if _[j] != "":
            tmp.append(tmp_fea[0] + ":" + _[j])
    # Sort the sample's features by their weight, largest first.
    sort_p = np.argsort(np.array(params[i]))[::-1]
    res = []
    for j in range(len(sort_p)):
        res.append(tmp[sort_p[j]] + "_" + str(params[i][sort_p[j]]))
    res.append(label)
    res.append(str(score[i]))
    # Recover the bias as score - sum(weights) (valid while the ReLU is active).
    bias = score[i] - sum(params[i])
    print("; ".join(res))
    assert (len(params[i]) == len(tmp))
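A hedged usage note: the three arguments are the trainer's dumped per-sample weights, the joined feature file (presumably raw_test from step 3 above, whose tab-separated layout matches the indexing here), and the dumped predictions. With hypothetical file names the call would look like

python parse.py dumped_params.txt raw_test dumped_scores.txt

and each printed line lists the sample's features sorted by weight, then the label and the score, e.g. "movieid:1345_0.31; userid:2021_0.27; ...; label:2; 2.41" (values illustrative).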