提交 924961ba 编写于 作者: H helinwang 提交者: GitHub

Merge pull request #198 from reyoung/feature/add_v2_code

Add V2 code for recommendation
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer.PyDataProvider2 import *
def meta_to_header(meta, name):
metas = meta[name]['__meta__']['raw_meta']
for each_meta in metas:
slot_name = each_meta.get('name', '%s_id' % name)
if each_meta['type'] == 'id':
yield slot_name, integer_value(each_meta['max'])
elif each_meta['type'] == 'embedding':
is_seq = each_meta['seq'] == 'sequence'
yield slot_name, integer_value(
len(each_meta['dict']),
seq_type=SequenceType.SEQUENCE
if is_seq else SequenceType.NO_SEQUENCE)
elif each_meta['type'] == 'one_hot_dense':
yield slot_name, dense_vector(len(each_meta['dict']))
{
"user": {
"file": {
"name": "users.dat",
"delimiter": "::"
},
"fields": ["id", "gender", "age", "occupation"]
},
"movie": {
"file": {
"name": "movies.dat",
"delimiter": "::"
},
"fields": ["id", "title", "genres"]
}
}
#!/bin/env python2
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
config_generator.py
Usage:
./config_generator.py <config_file> [--output_format=<output_format>]
./config_generator.py -h | --help
Options:
-h --help Show this screen.
--output_format=<output_format> Output Config format(json or yaml) [default: json].
"""
import json
import docopt
import copy
DEFAULT_FILE = {"type": "split", "delimiter": ","}
DEFAULT_FIELD = {
"id": {
"type": "id"
},
"gender": {
"name": "gender",
"type": "embedding",
"dict": {
"type": "char_based"
}
},
"age": {
"name": "age",
"type": "embedding",
"dict": {
"type": "whole_content",
"sort": True
}
},
"occupation": {
"name": "occupation",
"type": "embedding",
"dict": {
"type": "whole_content",
"sort": "true"
}
},
"title": {
"regex": {
"pattern": r"^(.*)\((\d+)\)$",
"group_id": 1,
"strip": True
},
"name": "title",
"type": {
"name": "embedding",
"seq_type": "sequence",
},
"dict": {
"type": "char_based"
}
},
"genres": {
"type": "one_hot_dense",
"dict": {
"type": "split",
"delimiter": "|"
},
"name": "genres"
}
}
def merge_dict(master_dict, slave_dict):
return dict(((k, master_dict.get(k) or slave_dict.get(k))
for k in set(slave_dict) | set(master_dict)))
def main(filename, fmt):
with open(filename, 'r') as f:
conf = json.load(f)
obj = dict()
for k in conf:
val = conf[k]
file_dict = val['file']
file_dict = merge_dict(file_dict, DEFAULT_FILE)
fields = []
for pos, field_key in enumerate(val['fields']):
assert isinstance(field_key, basestring)
field = copy.deepcopy(DEFAULT_FIELD[field_key])
field['pos'] = pos
fields.append(field)
obj[k] = {"file": file_dict, "fields": fields}
meta = {"meta": obj}
# print meta
if fmt == 'json':
def formatter(x):
import json
return json.dumps(x, indent=2)
elif fmt == 'yaml':
def formatter(x):
import yaml
return yaml.safe_dump(x, default_flow_style=False)
else:
raise NotImplementedError("Dump format %s is not implemented" % fmt)
print formatter(meta)
if __name__ == '__main__':
args = docopt.docopt(__doc__, version="0.1.0")
main(args["<config_file>"], args["--output_format"])
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -ex
cd "$(dirname "$0")"
# download the dataset
wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
# unzip the dataset
unzip ml-1m.zip
# remove the unused zip file
rm ml-1m.zip
#!/bin/env python2
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess Movielens dataset, to get movie/user object.
Usage:
./preprocess.py <dataset_dir> <binary_filename> [--config=<config_file>]
./preprocess.py -h | --help
Options:
-h --help Show this screen.
--version Show version.
--config=<config_file> Get MetaData config file [default: config.json].
"""
import docopt
import os
import sys
import re
import collections
try:
import cPickle as pickle
except ImportError:
import pickle
class UniqueIDGenerator(object):
def __init__(self):
self.pool = collections.defaultdict(self.__next_id__)
self.next_id = 0
def __next_id__(self):
tmp = self.next_id
self.next_id += 1
return tmp
def __call__(self, k):
return self.pool[k]
def to_list(self):
ret_val = [None] * len(self.pool)
for k in self.pool.keys():
ret_val[self.pool[k]] = k
return ret_val
class SortedIDGenerator(object):
def __init__(self):
self.__key_set__ = set()
self.dict = None
def scan(self, key):
self.__key_set__.add(key)
def finish_scan(self, compare=None, key=None, reverse=False):
self.__key_set__ = sorted(
list(self.__key_set__), cmp=compare, key=key, reverse=reverse)
self.dict = dict()
for idx, each_key in enumerate(self.__key_set__):
self.dict[each_key] = idx
def __call__(self, key):
return self.dict[key]
def to_list(self):
return self.__key_set__
class SplitFileReader(object):
def __init__(self, work_dir, config):
assert isinstance(config, dict)
self.filename = config['name']
self.delimiter = config.get('delimiter', ',')
self.work_dir = work_dir
def read(self):
with open(os.path.join(self.work_dir, self.filename), 'r') as f:
for line in f:
line = line.strip()
if isinstance(self.delimiter, unicode):
self.delimiter = str(self.delimiter)
yield line.split(self.delimiter)
@staticmethod
def create(work_dir, config):
assert isinstance(config, dict)
if config['type'] == 'split':
return SplitFileReader(work_dir, config)
class IFileReader(object):
READERS = [SplitFileReader]
def read(self):
raise NotImplementedError()
@staticmethod
def create(work_dir, config):
for reader_cls in IFileReader.READERS:
val = reader_cls.create(work_dir, config)
if val is not None:
return val
class IDFieldParser(object):
TYPE = 'id'
def __init__(self, config):
self.__max_id__ = -sys.maxint - 1
self.__min_id__ = sys.maxint
self.__id_count__ = 0
def scan(self, line):
idx = int(line)
self.__max_id__ = max(self.__max_id__, idx)
self.__min_id__ = min(self.__min_id__, idx)
self.__id_count__ += 1
def parse(self, line):
return int(line)
def meta_field(self):
return {
"is_key": True,
'max': self.__max_id__,
'min': self.__min_id__,
'count': self.__id_count__,
'type': 'id'
}
class SplitEmbeddingDict(object):
def __init__(self, delimiter):
self.__id__ = UniqueIDGenerator()
self.delimiter = delimiter
def scan(self, multi):
for val in multi.split(self.delimiter):
self.__id__(val)
def parse(self, multi):
return map(self.__id__, multi.split(self.delimiter))
def meta_field(self):
return self.__id__.to_list()
class EmbeddingFieldParser(object):
TYPE = 'embedding'
NO_SEQUENCE = "no_sequence"
SEQUENCE = "sequence"
class CharBasedEmbeddingDict(object):
def __init__(self, is_seq=True):
self.__id__ = UniqueIDGenerator()
self.is_seq = is_seq
def scan(self, s):
for ch in s:
self.__id__(ch)
def parse(self, s):
return map(self.__id__, s) if self.is_seq else self.__id__(s[0])
def meta_field(self):
return self.__id__.to_list()
class WholeContentDict(object):
def __init__(self, need_sort=True):
assert need_sort
self.__id__ = SortedIDGenerator()
self.__has_finished__ = False
def scan(self, txt):
self.__id__.scan(txt)
def meta_field(self):
if not self.__has_finished__:
self.__id__.finish_scan()
self.__has_finished__ = True
return self.__id__.to_list()
def parse(self, txt):
return self.__id__(txt)
def __init__(self, config):
try:
self.seq_type = config['type']['seq_type']
except TypeError:
self.seq_type = EmbeddingFieldParser.NO_SEQUENCE
if config['dict']['type'] == 'char_based':
self.dict = EmbeddingFieldParser.CharBasedEmbeddingDict(
self.seq_type == EmbeddingFieldParser.SEQUENCE)
elif config['dict']['type'] == 'split':
self.dict = SplitEmbeddingDict(config['dict'].get('delimiter', ','))
elif config['dict']['type'] == 'whole_content':
self.dict = EmbeddingFieldParser.WholeContentDict(
config['dict']['sort'])
else:
print config
assert False
self.name = config['name']
def scan(self, s):
self.dict.scan(s)
def meta_field(self):
return {
'name': self.name,
'dict': self.dict.meta_field(),
'type': 'embedding',
'seq': self.seq_type
}
def parse(self, s):
return self.dict.parse(s)
class OneHotDenseFieldParser(object):
TYPE = 'one_hot_dense'
def __init__(self, config):
if config['dict']['type'] == 'split':
self.dict = SplitEmbeddingDict(config['dict']['delimiter'])
self.name = config['name']
def scan(self, s):
self.dict.scan(s)
def meta_field(self):
# print self.dict.meta_field()
return {
'dict': self.dict.meta_field(),
'name': self.name,
'type': 'one_hot_dense'
}
def parse(self, s):
ids = self.dict.parse(s)
retv = [0.0] * len(self.dict.meta_field())
for idx in ids:
retv[idx] = 1.0
# print retv
return retv
class FieldParserFactory(object):
PARSERS = [IDFieldParser, EmbeddingFieldParser, OneHotDenseFieldParser]
@staticmethod
def create(config):
if isinstance(config['type'], basestring):
config_type = config['type']
elif isinstance(config['type'], dict):
config_type = config['type']['name']
assert config_type is not None
for each_parser_cls in FieldParserFactory.PARSERS:
if config_type == each_parser_cls.TYPE:
return each_parser_cls(config)
print config
class CompositeFieldParser(object):
def __init__(self, parser, extractor):
self.extractor = extractor
self.parser = parser
def scan(self, *args, **kwargs):
self.parser.scan(self.extractor.extract(*args, **kwargs))
def parse(self, *args, **kwargs):
return self.parser.parse(self.extractor.extract(*args, **kwargs))
def meta_field(self):
return self.parser.meta_field()
class PositionContentExtractor(object):
def __init__(self, pos):
self.pos = pos
def extract(self, line):
assert isinstance(line, list)
return line[self.pos]
class RegexPositionContentExtractor(PositionContentExtractor):
def __init__(self, pos, pattern, group_id, strip=True):
PositionContentExtractor.__init__(self, pos)
pattern = pattern.strip()
self.pattern = re.compile(pattern)
self.group_id = group_id
self.strip = strip
def extract(self, line):
line = PositionContentExtractor.extract(self, line)
match = self.pattern.match(line)
# print line, self.pattern.pattern, match
assert match is not None
txt = match.group(self.group_id)
if self.strip:
txt.strip()
return txt
class ContentExtractorFactory(object):
def extract(self, line):
pass
@staticmethod
def create(config):
if 'pos' in config:
if 'regex' not in config:
return PositionContentExtractor(config['pos'])
else:
extra_args = config['regex']
return RegexPositionContentExtractor(
pos=config['pos'], **extra_args)
class MetaFile(object):
def __init__(self, work_dir):
self.work_dir = work_dir
self.obj = dict()
def parse(self, config):
config = config['meta']
ret_obj = dict()
for key in config.keys():
val = config[key]
assert 'file' in val
reader = IFileReader.create(self.work_dir, val['file'])
assert reader is not None
assert 'fields' in val and isinstance(val['fields'], list)
fields_config = val['fields']
field_parsers = map(MetaFile.__field_config_mapper__, fields_config)
for each_parser in field_parsers:
assert each_parser is not None
for each_block in reader.read():
for each_parser in field_parsers:
each_parser.scan(each_block)
metas = map(lambda x: x.meta_field(), field_parsers)
# print metas
key_index = filter(
lambda x: x is not None,
map(lambda (idx, meta): idx if 'is_key' in meta and meta['is_key'] else None,
enumerate(metas)))[0]
key_map = []
for i in range(min(key_index, len(metas))):
key_map.append(i)
for i in range(key_index + 1, len(metas)):
key_map.append(i)
obj = {'__meta__': {'raw_meta': metas, 'feature_map': key_map}}
for each_block in reader.read():
idx = field_parsers[key_index].parse(each_block)
val = []
for i, each_parser in enumerate(field_parsers):
if i != key_index:
val.append(each_parser.parse(each_block))
obj[idx] = val
ret_obj[key] = obj
self.obj = ret_obj
return ret_obj
@staticmethod
def __field_config_mapper__(conf):
assert isinstance(conf, dict)
extrator = ContentExtractorFactory.create(conf)
field_parser = FieldParserFactory.create(conf)
assert extrator is not None
assert field_parser is not None
return CompositeFieldParser(field_parser, extrator)
def dump(self, fp):
pickle.dump(self.obj, fp, pickle.HIGHEST_PROTOCOL)
def preprocess(binary_filename, dataset_dir, config, **kwargs):
assert isinstance(config, str)
with open(config, 'r') as config_file:
file_loader = None
if config.lower().endswith('.yaml'):
import yaml
file_loader = yaml
elif config.lower().endswith('.json'):
import json
file_loader = json
config = file_loader.load(config_file)
meta = MetaFile(dataset_dir)
meta.parse(config)
with open(binary_filename, 'wb') as outf:
meta.dump(outf)
if __name__ == '__main__':
args = docopt.docopt(__doc__, version='0.1.0')
kwargs = dict()
for key in args.keys():
if key != '--help':
param_name = key
assert isinstance(param_name, str)
param_name = param_name.replace('<', '')
param_name = param_name.replace('>', '')
param_name = param_name.replace('--', '')
kwargs[param_name] = args[key]
preprocess(**kwargs)
#!/bin/env python2
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Separate movielens 1m dataset to train/test file.
Usage:
./separate.py <input_file> [--test_ratio=<test_ratio>] [--delimiter=<delimiter>]
./separate.py -h | --help
Options:
-h --help Show this screen.
--version Show version.
--test_ratio=<test_ratio> Test ratio for separate [default: 0.1].
--delimiter=<delimiter> File delimiter [default: ,].
"""
import docopt
import collections
import random
def process(test_ratio, input_file, delimiter, **kwargs):
test_ratio = float(test_ratio)
rating_dict = collections.defaultdict(list)
with open(input_file, 'r') as f:
for line in f:
user_id = int(line.split(delimiter)[0])
rating_dict[user_id].append(line.strip())
with open(input_file + ".train", 'w') as train_file:
with open(input_file + ".test", 'w') as test_file:
for k in rating_dict.keys():
lines = rating_dict[k]
assert isinstance(lines, list)
random.shuffle(lines)
test_len = int(len(lines) * test_ratio)
for line in lines[:test_len]:
print >> test_file, line
for line in lines[test_len:]:
print >> train_file, line
if __name__ == '__main__':
args = docopt.docopt(__doc__, version='0.1.0')
kwargs = dict()
for key in args.keys():
if key != '--help':
param_name = key
assert isinstance(param_name, str)
param_name = param_name.replace('<', '')
param_name = param_name.replace('>', '')
param_name = param_name.replace('--', '')
kwargs[param_name] = args[key]
process(**kwargs)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer.PyDataProvider2 import *
from common_utils import meta_to_header
def __list_to_map__(lst):
ret_val = dict()
for each in lst:
k, v = each
ret_val[k] = v
return ret_val
def hook(settings, meta, **kwargs):
"""
Init hook is invoked before process data. It will set obj.slots and store
data meta.
:param obj: global object. It will passed to process routine.
:type obj: object
:param meta: the meta file object, which passed from trainer_config. Meta
file record movie/user features.
:param kwargs: unused other arguments.
"""
# Header define slots that used for paddle.
# first part is movie features.
# second part is user features.
# final part is rating score.
# header is a list of [USE_SEQ_OR_NOT?, SlotType]
movie_headers = list(meta_to_header(meta, 'movie'))
settings.movie_names = [h[0] for h in movie_headers]
headers = movie_headers
user_headers = list(meta_to_header(meta, 'user'))
settings.user_names = [h[0] for h in user_headers]
headers.extend(user_headers)
headers.append(("rating", dense_vector(1))) # Score
# slot types.
settings.input_types = __list_to_map__(headers)
settings.meta = meta
@provider(init_hook=hook, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, filename):
with open(filename, 'r') as f:
for line in f:
# Get a rating from file.
user_id, movie_id, score = map(int, line.split('::')[:-1])
# Scale score to [-2, +2]
score = float(score - 3)
# Get movie/user features by movie_id, user_id
movie_meta = settings.meta['movie'][movie_id]
user_meta = settings.meta['user'][user_id]
outputs = [('movie_id', movie_id - 1)]
# Then add movie features
for i, each_meta in enumerate(movie_meta):
outputs.append((settings.movie_names[i + 1], each_meta))
# Then add user id.
outputs.append(('user_id', user_id - 1))
# Then add user features.
for i, each_meta in enumerate(user_meta):
outputs.append((settings.user_names[i + 1], each_meta))
# Finally, add score
outputs.append(('rating', [score]))
# Return data to paddle
yield __list_to_map__(outputs)
#!/usr/bin/python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import re
import math
def get_best_pass(log_filename):
with open(log_filename, 'r') as f:
text = f.read()
pattern = re.compile('Test.*? cost=([0-9]+\.[0-9]+).*?pass-([0-9]+)',
re.S)
results = re.findall(pattern, text)
sorted_results = sorted(results, key=lambda result: float(result[0]))
return sorted_results[0]
log_filename = sys.argv[1]
log = get_best_pass(log_filename)
predict_error = math.sqrt(float(log[0])) / 2
print 'Best pass is %s, error is %s, which means predict get error as %f' % (
log[1], log[0], predict_error)
evaluate_pass = "output/pass-%s" % log[1]
print "evaluating from pass %s" % evaluate_pass
#!/bin/env python2
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from py_paddle import swig_paddle, DataProviderConverter
from common_utils import *
from paddle.trainer.config_parser import parse_config
try:
import cPickle as pickle
except ImportError:
import pickle
import sys
if __name__ == '__main__':
model_path = sys.argv[1]
swig_paddle.initPaddle('--use_gpu=0')
conf = parse_config("trainer_config.py", "is_predict=1")
network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
assert isinstance(network, swig_paddle.GradientMachine)
network.loadParameters(model_path)
with open('./data/meta.bin', 'rb') as f:
meta = pickle.load(f)
headers = [h[1] for h in meta_to_header(meta, 'movie')]
headers.extend([h[1] for h in meta_to_header(meta, 'user')])
cvt = DataProviderConverter(headers)
while True:
movie_id = int(raw_input("Input movie_id: "))
user_id = int(raw_input("Input user_id: "))
movie_meta = meta['movie'][movie_id] # Query Data From Meta.
user_meta = meta['user'][user_id]
data = [movie_id - 1]
data.extend(movie_meta)
data.append(user_id - 1)
data.extend(user_meta)
print "Prediction Score is %.2f" % (
network.forwardTest(cvt.convert([data]))[0]['value'][0][0] + 3)
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
UNAME_STR=`uname`
if [[ ${UNAME_STR} == 'Linux' ]]; then
SHUF_PROG='shuf'
else
SHUF_PROG='gshuf'
fi
cd "$(dirname "$0")"
delimiter='::'
dir=ml-1m
cd data
echo 'generate meta config file'
python config_generator.py config.json > meta_config.json
echo 'generate meta file'
python meta_generator.py $dir meta.bin --config=meta_config.json
echo 'split train/test file'
python split.py $dir/ratings.dat --delimiter=${delimiter} --test_ratio=0.1
echo 'shuffle train file'
${SHUF_PROG} $dir/ratings.dat.train > ratings.dat.train
cp $dir/ratings.dat.test .
echo "./data/ratings.dat.train" > train.list
echo "./data/ratings.dat.test" > test.list
import paddle.v2 as paddle
import cPickle
import copy
def main():
paddle.init(use_gpu=False)
movie_title_dict = paddle.dataset.movielens.get_movie_title_dict()
uid = paddle.layer.data(
name='user_id',
type=paddle.data_type.integer_value(
paddle.dataset.movielens.max_user_id() + 1))
usr_emb = paddle.layer.embedding(input=uid, size=32)
usr_gender_id = paddle.layer.data(
name='gender_id', type=paddle.data_type.integer_value(2))
usr_gender_emb = paddle.layer.embedding(input=usr_gender_id, size=16)
usr_age_id = paddle.layer.data(
name='age_id',
type=paddle.data_type.integer_value(
len(paddle.dataset.movielens.age_table)))
usr_age_emb = paddle.layer.embedding(input=usr_age_id, size=16)
usr_job_id = paddle.layer.data(
name='job_id',
type=paddle.data_type.integer_value(
paddle.dataset.movielens.max_job_id() + 1))
usr_job_emb = paddle.layer.embedding(input=usr_job_id, size=16)
usr_combined_features = paddle.layer.fc(
input=[usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb],
size=200,
act=paddle.activation.Tanh())
mov_id = paddle.layer.data(
name='movie_id',
type=paddle.data_type.integer_value(
paddle.dataset.movielens.max_movie_id() + 1))
mov_emb = paddle.layer.embedding(input=mov_id, size=32)
mov_categories = paddle.layer.data(
name='category_id',
type=paddle.data_type.sparse_binary_vector(
len(paddle.dataset.movielens.movie_categories())))
mov_categories_hidden = paddle.layer.fc(input=mov_categories, size=32)
mov_title_id = paddle.layer.data(
name='movie_title',
type=paddle.data_type.integer_value_sequence(len(movie_title_dict)))
mov_title_emb = paddle.layer.embedding(input=mov_title_id, size=32)
mov_title_conv = paddle.networks.sequence_conv_pool(
input=mov_title_emb, hidden_size=32, context_len=3)
mov_combined_features = paddle.layer.fc(
input=[mov_emb, mov_categories_hidden, mov_title_conv],
size=200,
act=paddle.activation.Tanh())
inference = paddle.layer.cos_sim(
a=usr_combined_features, b=mov_combined_features, size=1, scale=5)
cost = paddle.layer.regression_cost(
input=inference,
label=paddle.layer.data(
name='score', type=paddle.data_type.dense_vector(1)))
parameters = paddle.parameters.create(cost)
trainer = paddle.trainer.SGD(
cost=cost,
parameters=parameters,
update_equation=paddle.optimizer.Adam(learning_rate=1e-4))
feeding = {
'user_id': 0,
'gender_id': 1,
'age_id': 2,
'job_id': 3,
'movie_id': 4,
'category_id': 5,
'movie_title': 6,
'score': 7
}
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "Pass %d Batch %d Cost %.2f" % (
event.pass_id, event.batch_id, event.cost)
trainer.train(
reader=paddle.batch(
paddle.reader.shuffle(
paddle.dataset.movielens.train(), buf_size=8192),
batch_size=256),
event_handler=event_handler,
feeding=feeding,
num_passes=1)
user_id = 234
movie_id = 345
user = paddle.dataset.movielens.user_info()[user_id]
movie = paddle.dataset.movielens.movie_info()[movie_id]
feature = user.value() + movie.value()
def reader():
yield feature
infer_dict = copy.copy(feeding)
del infer_dict['score']
prediction = paddle.infer(
output_layer=inference,
parameters=parameters,
input=[feature],
feeding=infer_dict)
print(prediction + 5) / 2
if __name__ == '__main__':
main()
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
paddle train \
--config=trainer_config.py \
--save_dir=./output \
--use_gpu=false \
--trainer_count=4\
--test_all_data_in_one_period=true \
--log_period=100 \
--dot_period=1 \
--num_passes=50 2>&1 | tee 'log.txt'
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
try:
import cPickle as pickle
except ImportError:
import pickle
is_predict = get_config_arg('is_predict', bool, False)
META_FILE = 'data/meta.bin'
with open(META_FILE, 'rb') as f:
# load meta file
meta = pickle.load(f)
if not is_predict:
define_py_data_sources2(
'data/train.list',
'data/test.list',
module='dataprovider',
obj='process',
args={'meta': meta})
settings(
batch_size=1600, learning_rate=1e-3, learning_method=RMSPropOptimizer())
movie_meta = meta['movie']['__meta__']['raw_meta']
user_meta = meta['user']['__meta__']['raw_meta']
movie_id = data_layer('movie_id', size=movie_meta[0]['max'])
title = data_layer('title', size=len(movie_meta[1]['dict']))
genres = data_layer('genres', size=len(movie_meta[2]['dict']))
user_id = data_layer('user_id', size=user_meta[0]['max'])
gender = data_layer('gender', size=len(user_meta[1]['dict']))
age = data_layer('age', size=len(user_meta[2]['dict']))
occupation = data_layer('occupation', size=len(user_meta[3]['dict']))
embsize = 256
# construct movie feature
movie_id_emb = embedding_layer(input=movie_id, size=embsize)
movie_id_hidden = fc_layer(input=movie_id_emb, size=embsize)
genres_emb = fc_layer(input=genres, size=embsize)
title_emb = embedding_layer(input=title, size=embsize)
title_hidden = text_conv_pool(
input=title_emb, context_len=5, hidden_size=embsize)
movie_feature = fc_layer(
input=[movie_id_hidden, title_hidden, genres_emb], size=embsize)
# construct user feature
user_id_emb = embedding_layer(input=user_id, size=embsize)
user_id_hidden = fc_layer(input=user_id_emb, size=embsize)
gender_emb = embedding_layer(input=gender, size=embsize)
gender_hidden = fc_layer(input=gender_emb, size=embsize)
age_emb = embedding_layer(input=age, size=embsize)
age_hidden = fc_layer(input=age_emb, size=embsize)
occup_emb = embedding_layer(input=occupation, size=embsize)
occup_hidden = fc_layer(input=occup_emb, size=embsize)
user_feature = fc_layer(
input=[user_id_hidden, gender_hidden, age_hidden, occup_hidden],
size=embsize)
similarity = cos_sim(a=movie_feature, b=user_feature, scale=2)
if not is_predict:
lbl = data_layer('rating', size=1)
cost = regression_cost(input=similarity, label=lbl)
outputs(cost)
else:
outputs(similarity)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册