Commit 2eba5d90 authored by malin10

add movie_recommand demo

Parent dfb74b32
......@@ -20,6 +20,8 @@ from __future__ import print_function
import time
import logging
import os
import json
import numpy as np
import paddle.fluid as fluid
from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer
......@@ -263,8 +265,10 @@ class SingleInfer(TranspileTrainer):
envs.get_global_env("runner." + self._runner_name +
".print_interval", 20))
metrics_format.append("{}: {{}}".format("batch"))
metrics_indexes = dict()
for name, var in metrics.items():
metrics_varnames.append(var.name)
metrics_indexes[var.name] = len(metrics_varnames) - 1
metrics_format.append("{}: {{}}".format(name))
metrics_format = ", ".join(metrics_format)
......@@ -272,19 +276,30 @@ class SingleInfer(TranspileTrainer):
        reader.start()
        batch_id = 0
        scope = self._model[model_name][2]
        infer_results = []
        with fluid.scope_guard(scope):
            try:
                while True:
                    metrics_rets = self._exe.run(
                        program=program,
                        fetch_list=metrics_varnames,
                        return_numpy=False)
                    metrics = [batch_id]
                    metrics.extend(metrics_rets)
                    batch_infer_result = {}
                    for k, v in metrics_indexes.items():
                        batch_infer_result[k] = np.array(
                            metrics_rets[v]).tolist()
                    infer_results.append(batch_infer_result)
                    if batch_id % fetch_period == 0 and batch_id != 0:
                        print(metrics_format.format(*metrics))
                    batch_id += 1
            except fluid.core.EOFException:
                reader.reset()
        with open(model_dict['save_path'], 'w') as fout:
            json.dump(infer_results, fout)

    def terminal(self, context):
        context['is_exit'] = True
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# coding=utf8
import sys
if sys.version_info[0] == 2:
    # Python 2 only; on Python 3 the default encoding is already UTF-8.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')
import random
import json

user_fea = ["userid", "gender", "age", "occupation"]
movie_fea = ["movieid", "title", "genres"]
rating_fea = ["userid", "movieid", "rating", "time"]
dict_size = 60000000
hash_dict = dict()
data_path = "ml-1m"
test_user_path = "online_user"


def process(path):
    user_dict = parse_data(data_path + "/users.dat", user_fea)
    movie_dict = parse_movie_data(data_path + "/movies.dat", movie_fea)
    for line in open(path):
        line = line.strip()
        arr = line.split("::")
        userid = arr[0]
        movieid = arr[1]
        out_str = "time:%s\t%s\t%s\tlabel:%s" % (arr[3], user_dict[userid],
                                                 movie_dict[movieid], arr[2])
        # note: Python 3 salts hash() per process; run with PYTHONHASHSEED=0
        # for log ids and feature hashes that are stable across runs
        log_id = hash(out_str) % 1000000000
        print("%s\t%s" % (log_id, out_str))


def parse_data(file_name, feas):
    out_dict = {}
    for line in open(file_name):
        line = line.strip()
        arr = line.split("::")
        out_str = ""
        for i in range(0, len(feas)):
            out_str += "%s:%s\t" % (feas[i], arr[i])
        out_dict[arr[0]] = out_str.strip()
    return out_dict


def parse_movie_data(file_name, feas):
    out_dict = {}
    for line in open(file_name):
        line = line.strip()
        arr = line.split("::")
        title_str = ""
        genres_str = ""
        for term in arr[1].split(" "):
            term = term.strip()
            if term != "":
                title_str += "%s " % (term)
        for term in arr[2].split("|"):
            term = term.strip()
            if term != "":
                genres_str += "%s " % (term)
        out_str = "movieid:%s\ttitle:%s\tgenres:%s" % (
            arr[0], title_str.strip(), genres_str.strip())
        out_dict[arr[0]] = out_str.strip()
    return out_dict


def to_hash(in_str):
    feas = in_str.split(":")[0]
    arr = in_str.split(":")[1]
    # salt the value with reversed/strided copies of itself before hashing
    # to make accidental collisions detectable
    out_str = "%s:%s" % (feas, (arr + arr[::-1] + arr[::-2] + arr[::-3]))
    hash_id = hash(out_str) % dict_size
    if hash_id in hash_dict and hash_dict[hash_id] != out_str:
        print(hash_id, out_str, hash(out_str))
        print("conflict")
        exit(-1)
    return "%s:%s" % (feas, hash_id)


def to_hash_list(in_str):
    arr = in_str.split(":")
    tmp_arr = arr[1].split(" ")
    out_str = ""
    for item in tmp_arr:
        item = item.strip()
        if item != "":
            key = "%s:%s" % (arr[0], item)
            out_str += "%s " % (to_hash(key))
    return out_str.strip()


def get_hash(path):
    # 0-34831 1-time:974673057 2-userid:2021 3-gender:M 4-age:25 5-occupation:0 6-movieid:1345 7-title:Carrie (1976) 8-genres:Horror 9-label:2
    for line in open(path):
        arr = line.strip().split("\t")
        out_str = "logid:%s %s %s %s %s %s %s %s %s %s" % \
            (arr[0], arr[1], to_hash(arr[2]), to_hash(arr[3]), to_hash(arr[4]), to_hash(arr[5]), \
             to_hash(arr[6]), to_hash_list(arr[7]), to_hash_list(arr[8]), arr[9])
        print(out_str)


def generate_online_user():
    movie_dict = parse_movie_data(data_path + "/movies.dat", movie_fea)
    with open(test_user_path + "/movies.dat", 'w') as f:
        for line in open(test_user_path + "/users.dat"):
            line = line.strip()
            arr = line.split("::")
            userid = arr[0]
            for item in movie_dict:
                f.write(userid + "::" + item + "::1")
                f.write("\n")


def generate_online_data(path):
    user_dict = parse_data(data_path + "/users.dat", user_fea)
    movie_dict = parse_movie_data(data_path + "/movies.dat", movie_fea)
    for line in open(path):
        line = line.strip()
        arr = line.split("::")
        userid = arr[0]
        movieid = arr[1]
        label = arr[2]
        out_str = "time:%s\t%s\t%s\tlabel:%s" % ("1", user_dict[userid],
                                                 movie_dict[movieid], label)
        log_id = hash(out_str) % 1000000000
        res = "%s\t%s" % (log_id, out_str)
        arr = res.strip().split("\t")
        out_str = "logid:%s %s %s %s %s %s %s %s %s %s" % \
            (arr[0], arr[1], to_hash(arr[2]), to_hash(arr[3]), to_hash(arr[4]), to_hash(arr[5]), \
             to_hash(arr[6]), to_hash_list(arr[7]), to_hash_list(arr[8]), arr[9])
        print(out_str)


if __name__ == "__main__":
    random.seed(1111111)
    if sys.argv[1] == "process_raw":
        process(sys.argv[2])
    elif sys.argv[1] == "hash":
        get_hash(sys.argv[2])
    elif sys.argv[1] == "data_recall":
        generate_online_user()
        generate_online_data(test_user_path + "/movies.dat")
    elif sys.argv[1] == "data_rank":
        generate_online_data(test_user_path + "/movies.dat")
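For reference, a raw rating such as `2021::1345::2::974673057` is joined with the user and movie tables into one `process_raw` line (the tab-separated format shown in the comment inside get_hash), and `hash` then maps each `slot:value` feature to an integer id. An illustrative reproduction of the per-slot hashing:

# Illustrative only: reproduce to_hash() on a single "slot:value" feature.
# Python 3 salts hash() per process, so set PYTHONHASHSEED=0 for ids that
# are stable across runs (Python 2's hash() was already deterministic).
sample = "gender:M"
slot, value = sample.split(":")[0], sample.split(":")[1]
salted = "%s:%s" % (slot, value + value[::-1] + value[::-2] + value[::-3])
print("%s:%s" % (slot, hash(salted) % 60000000))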
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random

train = dict()
test = dict()
data_path = "ml-1m"

for line in open(data_path + "/ratings.dat"):
    fea = line.rstrip().split("::")
    if fea[0] not in train:
        train[fea[0]] = [line]
    elif fea[0] not in test:
        test[fea[0]] = dict()
        test[fea[0]]['time'] = int(fea[3])
        test[fea[0]]['content'] = line
    else:
        time = int(fea[3])
        if time <= test[fea[0]]['time']:
            train[fea[0]].append(line)
        else:
            train[fea[0]].append(test[fea[0]]['content'])
            test[fea[0]]['time'] = time
            test[fea[0]]['content'] = line

train_data = []
for key in train:
    for line in train[key]:
        train_data.append(line)

random.shuffle(train_data)
with open(data_path + "/train.dat", 'w') as f:
    for line in train_data:
        f.write(line)

with open(data_path + "/test.dat", 'w') as f:
    for key in test:
        f.write(test[key]['content'])
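split.py is a leave-one-out split: for each user with at least two ratings, the rating with the latest timestamp is held out into test.dat and everything else is shuffled into train.dat. A quick sanity check, a sketch assuming split.py has already been run in ml-1m/:

# Each user present in test.dat should contribute exactly one held-out rating.
from collections import Counter

test_users = Counter()
with open("ml-1m/test.dat") as f:
    for line in f:
        test_users[line.split("::")[0]] += 1

assert all(c == 1 for c in test_users.values())
print(len(test_users), "users with one test rating each")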
cd data
wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
unzip ml-1m.zip
python split.py
mkdir train/
mkdir test/
python process_ml_1m.py process_raw ./ml-1m/train.dat | sort -t $'\t' -k 9 -n > log.data.train
python process_ml_1m.py process_raw ./ml-1m/test.dat | sort -t $'\t' -k 9 -n > log.data.test
python process_ml_1m.py hash log.data.train > ./train/log.data.hash
python process_ml_1m.py hash log.data.test > ./test/log.data.hash
rm log.data.train
rm log.data.test
cd ../
## first switch config.yaml in both recall/ and rank/ to offline infer mode
cd recall
python -m paddlerec.run -m ./config.yaml
cd ../rank
python -m paddlerec.run -m ./config.yaml
cd ..
echo "recall offline test result:"
python parse.py recall_offline recall/infer_result
echo "rank offline test result:"
python parse.py rank_offline rank/infer_result
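parse.py itself is not part of this diff, so the metrics it reports are not shown here; a sketch in its spirit would recompute a squared error from the saved JSON. Note that SingleInfer keys each batch by the internal fetch-variable name, so the two key names below are hypothetical placeholders to be read off one batch first:

import json
import math

# Sketch only: parse.py's real logic is not included in this commit excerpt.
with open("rank/infer_result") as f:
    batches = json.load(f)
print("keys in one batch:", sorted(batches[0].keys()))

PREDICT_KEY = "predict_var_name"  # hypothetical, identify from the printout
LABEL_KEY = "label_var_name"      # hypothetical
se, n = 0.0, 0
for batch in batches:
    for p, l in zip(batch[PREDICT_KEY], batch[LABEL_KEY]):
        se += (float(p[0]) - float(l[0])) ** 2
        n += 1
print("offline RMSE: %.4f" % math.sqrt(se / n))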
cd data
mkdir -p online_user/test
python process_ml_1m.py data_rank > online_user/test/data.txt
## modify rank/config.yaml to online_infer mode
cd ../rank
python -m paddlerec.run -m ./config.yaml
cd ../
python parse.py rank_online rank/infer_result
cd data
mkdir -p online_user/test
python process_ml_1m.py data_recall > online_user/test/data.txt
## modify recall/config.yaml to online_infer mode
cd ../recall
python -m paddlerec.run -m ./config.yaml
cd ../
python parse.py recall_online recall/infer_result
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "demo/movie_recommand"
# list of dataset
dataset:
- name: dataset_train # name of dataset to distinguish different datasets
batch_size: 128
type: QueueDataset
data_path: "{workspace}/data/train"
sparse_slots: "logid time userid gender age occupation movieid title genres label"
dense_slots: ""
- name: dataset_infer # name
batch_size: 128
type: DataLoader
data_path: "{workspace}/data/test"
sparse_slots: "logid time userid gender age occupation movieid title genres label"
dense_slots: ""
- name: dataset_online_infer # name
batch_size: 10
type: DataLoader
data_path: "{workspace}/data/online_user/test"
sparse_slots: "logid time userid gender age occupation movieid title genres label"
dense_slots: ""
# hyper parameters of user-defined network
hyper_parameters:
# optimizer config
optimizer:
class: Adam
learning_rate: 0.001
strategy: async
# user-defined <key, value> pairs
sparse_feature_number: 60000000
sparse_feature_dim: 9
dense_input_dim: 13
fc_sizes: [512, 256, 128, 32]
# train
mode: runner_train
## online or offline infer
#mode: runner_infer
runner:
- name: runner_train
class: single_train
save_checkpoint_interval: 1 # save model interval of epochs
save_inference_interval: 1 # save inference
save_checkpoint_path: "increment" # save checkpoint path
save_inference_path: "inference" # save inference path
epochs: 10
device: cpu
- name: runner_infer
epochs: 1
class: single_infer
print_interval: 10000
init_model_path: "increment/9" # load model path
#train
phase:
- name: phase1
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_train # select dataset by name
thread_num: 12
##offline infer
#phase:
#- name: phase1
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_infer # select dataset by name
# save_path: "./infer_result"
# thread_num: 1
##offline infer
#phase:
#- name: phase1
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_online_infer # select dataset by name
# save_path: "./infer_result"
# thread_num: 1
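The "modify config.yaml to infer mode" step in the commands above means swapping in this file's own commented regions. For offline infer, for example, the result looks like this (values taken verbatim from the commented lines above):

mode: runner_infer

phase:
- name: phase1
  model: "{workspace}/model.py"
  dataset_name: dataset_infer
  save_path: "./infer_result"
  thread_num: 1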
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid as fluid

from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase


class Model(ModelBase):
    def __init__(self, config):
        ModelBase.__init__(self, config)

    def _init_hyper_parameters(self):
        self.is_distributed = True if envs.get_trainer(
        ) == "CtrTrainer" else False
        self.sparse_feature_number = envs.get_global_env(
            "hyper_parameters.sparse_feature_number")
        self.sparse_feature_dim = envs.get_global_env(
            "hyper_parameters.sparse_feature_dim")
        self.learning_rate = envs.get_global_env(
            "hyper_parameters.optimizer.learning_rate")
        self.hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes")

    def net(self, input, is_infer=False):
        self.user_sparse_inputs = self._sparse_data_var[2:6]
        self.mov_sparse_inputs = self._sparse_data_var[6:9]
        self.label_input = self._sparse_data_var[-1]

        def fc(input):
            fcs = [input]
            for size in self.hidden_layers:
                output = fluid.layers.fc(
                    input=fcs[-1],
                    size=size,
                    act='relu',
                    param_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.Normal(
                            scale=1.0 / math.sqrt(fcs[-1].shape[1]))))
                fcs.append(output)
            return fcs[-1]

        def embedding_layer(input):
            emb = fluid.layers.embedding(
                input=input,
                is_sparse=True,
                is_distributed=self.is_distributed,
                size=[self.sparse_feature_number, self.sparse_feature_dim],
                param_attr=fluid.ParamAttr(
                    name="emb", initializer=fluid.initializer.Uniform()), )
            emb_sum = fluid.layers.sequence_pool(input=emb, pool_type='sum')
            return emb_sum

        user_sparse_embed_seq = list(
            map(embedding_layer, self.user_sparse_inputs))
        mov_sparse_embed_seq = list(
            map(embedding_layer, self.mov_sparse_inputs))
        concated_user = fluid.layers.concat(user_sparse_embed_seq, axis=1)
        concated_mov = fluid.layers.concat(mov_sparse_embed_seq, axis=1)

        usr_combined_features = fc(concated_user)
        mov_combined_features = fc(concated_mov)

        fc_input = fluid.layers.concat(
            [usr_combined_features, mov_combined_features], axis=1)
        sim = fluid.layers.fc(
            input=fc_input,
            size=1,
            act='sigmoid',
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
                scale=1.0 / math.sqrt(fc_input.shape[1]))))
        predict = fluid.layers.scale(sim, scale=5)
        self.predict = predict
        #auc, batch_auc, _ = fluid.layers.auc(input=self.predict,
        #                                     label=self.label_input,
        #                                     num_thresholds=10000,
        #                                     slide_steps=20)
        if is_infer:
            self._infer_results["user_feature"] = usr_combined_features
            self._infer_results["movie_feature"] = mov_combined_features
            self._infer_results["uid"] = self._sparse_data_var[2]
            self._infer_results["movieid"] = self._sparse_data_var[6]
            self._infer_results["label"] = self._sparse_data_var[-1]
            self._infer_results["predict"] = self.predict
            return

        #self._metrics["AUC"] = auc
        #self._metrics["BATCH_AUC"] = batch_auc
        #cost = fluid.layers.cross_entropy(
        #    input=self.predict, label=self.label_input)
        cost = fluid.layers.square_error_cost(
            self.predict,
            fluid.layers.cast(
                x=self.label_input, dtype='float32'))
        avg_cost = fluid.layers.reduce_mean(cost)
        self._cost = avg_cost
        self._metrics["LOSS"] = avg_cost

    def optimizer(self):
        optimizer = fluid.optimizer.Adam(self.learning_rate, lazy_mode=True)
        return optimizer

    def infer_net(self):
        pass
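Because `sim` is a sigmoid output in (0, 1), scaling it by 5 maps the rank model's prediction into the (0, 5) rating range before the squared error against the 1-5 label is taken. A toy check of that mapping, with plain NumPy standing in for the fluid ops:

import numpy as np

def predict_from_logit(logit):
    # sigmoid then scale=5, mirroring fluid.layers.scale(sigmoid_fc, scale=5)
    return 5.0 / (1.0 + np.exp(-logit))

for logit in (-4.0, 0.0, 4.0):
    print(logit, "->", round(predict_from_logit(logit), 3))
# approximately: -4.0 -> 0.09, 0.0 -> 2.5, 4.0 -> 4.91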
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "demo/movie_recommand"
# list of dataset
dataset:
- name: dataset_train # name of dataset to distinguish different datasets
batch_size: 128
type: QueueDataset
data_path: "{workspace}/data/train"
sparse_slots: "logid time userid gender age occupation movieid title genres label"
dense_slots: ""
- name: dataset_infer # name
batch_size: 128
type: DataLoader
data_path: "{workspace}/data/test"
sparse_slots: "logid time userid gender age occupation movieid title genres label"
dense_slots: ""
- name: dataset_online_infer # name
batch_size: 128
type: DataLoader
data_path: "{workspace}/data/online_user/test"
sparse_slots: "logid time userid gender age occupation movieid title genres label"
dense_slots: ""
# hyper parameters of user-defined network
hyper_parameters:
# optimizer config
optimizer:
class: Adam
learning_rate: 0.001
strategy: async
# user-defined <key, value> pairs
sparse_feature_number: 60000000
sparse_feature_dim: 9
dense_input_dim: 13
fc_sizes: [512, 256, 128, 32]
# train
mode: runner_train
## online or offline infer
#mode: runner_infer
runner:
- name: runner_train
class: single_train
save_checkpoint_interval: 1 # save model interval of epochs
save_inference_interval: 1 # save inference
save_checkpoint_path: "increment" # save checkpoint path
save_inference_path: "inference" # save inference path
epochs: 10
device: cpu
- name: runner_infer
epochs: 1
class: single_infer
print_interval: 10000
init_model_path: "increment/9" # load model path
#train
phase:
- name: phase1
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_train # select dataset by name
thread_num: 12
##offline infer
#phase:
#- name: phase1
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_infer # select dataset by name
# save_path: "./infer_result"
# thread_num: 1
##offline infer
#phase:
#- name: phase1
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_online_infer # select dataset by name
# save_path: "./infer_result"
# thread_num: 1
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid as fluid

from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase


class Model(ModelBase):
    def __init__(self, config):
        ModelBase.__init__(self, config)

    def _init_hyper_parameters(self):
        self.is_distributed = True if envs.get_trainer(
        ) == "CtrTrainer" else False
        self.sparse_feature_number = envs.get_global_env(
            "hyper_parameters.sparse_feature_number")
        self.sparse_feature_dim = envs.get_global_env(
            "hyper_parameters.sparse_feature_dim")
        self.learning_rate = envs.get_global_env(
            "hyper_parameters.optimizer.learning_rate")
        self.hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes")

    def net(self, input, is_infer=False):
        self.user_sparse_inputs = self._sparse_data_var[2:6]
        self.mov_sparse_inputs = self._sparse_data_var[6:9]
        self.label_input = self._sparse_data_var[-1]

        def fc(input):
            fcs = [input]
            for size in self.hidden_layers:
                output = fluid.layers.fc(
                    input=fcs[-1],
                    size=size,
                    act='relu',
                    param_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.Normal(
                            scale=1.0 / math.sqrt(fcs[-1].shape[1]))))
                fcs.append(output)
            return fcs[-1]

        def embedding_layer(input):
            emb = fluid.layers.embedding(
                input=input,
                is_sparse=True,
                is_distributed=self.is_distributed,
                size=[self.sparse_feature_number, self.sparse_feature_dim],
                param_attr=fluid.ParamAttr(
                    name="emb", initializer=fluid.initializer.Uniform()), )
            emb_sum = fluid.layers.sequence_pool(input=emb, pool_type='sum')
            return emb_sum

        user_sparse_embed_seq = list(
            map(embedding_layer, self.user_sparse_inputs))
        mov_sparse_embed_seq = list(
            map(embedding_layer, self.mov_sparse_inputs))
        concated_user = fluid.layers.concat(user_sparse_embed_seq, axis=1)
        concated_mov = fluid.layers.concat(mov_sparse_embed_seq, axis=1)

        usr_combined_features = fc(concated_user)
        mov_combined_features = fc(concated_mov)

        sim = fluid.layers.cos_sim(
            X=usr_combined_features, Y=mov_combined_features)
        predict = fluid.layers.scale(sim, scale=5)
        self.predict = predict

        if is_infer:
            self._infer_results["uid"] = self._sparse_data_var[2]
            self._infer_results["movieid"] = self._sparse_data_var[6]
            self._infer_results["label"] = self._sparse_data_var[-1]
            self._infer_results["predict"] = self.predict
            return

        cost = fluid.layers.square_error_cost(
            self.predict,
            fluid.layers.cast(
                x=self.label_input, dtype='float32'))
        avg_cost = fluid.layers.reduce_mean(cost)
        self._cost = avg_cost
        self._metrics["LOSS"] = avg_cost

    def optimizer(self):
        optimizer = fluid.optimizer.Adam(self.learning_rate, lazy_mode=True)
        return optimizer
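The recall model replaces the rank model's joint FC head with a cosine similarity between the user and movie towers, so `predict = 5 * cos_sim` lies in [-5, 5] rather than (0, 5). A quick NumPy illustration of that scoring:

import numpy as np

def recall_score(user_vec, movie_vec):
    # cosine similarity scaled by 5, mirroring fluid.layers.cos_sim + scale
    cos = np.dot(user_vec, movie_vec) / (
        np.linalg.norm(user_vec) * np.linalg.norm(movie_vec))
    return 5.0 * cos

u = np.array([0.2, 0.8, 0.1])
print(recall_score(u, u))   # identical towers -> 5.0
print(recall_score(u, -u))  # opposite towers -> -5.0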
cd recall
python -m paddlerec.run -m ./config.yaml
cd ../rank
python -m paddlerec.run -m ./config.yaml &> train_log &
cd ..
echo "recall offline test: "
python infer_analys