diff --git a/recommender_system/common_utils.py b/recommender_system/common_utils.py deleted file mode 100755 index c20c65286621d701ad58409b539bbe9c813d453a..0000000000000000000000000000000000000000 --- a/recommender_system/common_utils.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from paddle.trainer.PyDataProvider2 import * - - -def meta_to_header(meta, name): - metas = meta[name]['__meta__']['raw_meta'] - for each_meta in metas: - slot_name = each_meta.get('name', '%s_id' % name) - if each_meta['type'] == 'id': - yield slot_name, integer_value(each_meta['max']) - elif each_meta['type'] == 'embedding': - is_seq = each_meta['seq'] == 'sequence' - yield slot_name, integer_value( - len(each_meta['dict']), - seq_type=SequenceType.SEQUENCE - if is_seq else SequenceType.NO_SEQUENCE) - elif each_meta['type'] == 'one_hot_dense': - yield slot_name, dense_vector(len(each_meta['dict'])) diff --git a/recommender_system/data/config.json b/recommender_system/data/config.json deleted file mode 100644 index f26e74ce47bb7843a571e6033f051c046b31f054..0000000000000000000000000000000000000000 --- a/recommender_system/data/config.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "user": { - "file": { - "name": "users.dat", - "delimiter": "::" - }, - "fields": ["id", "gender", "age", "occupation"] - }, - "movie": { - "file": { - "name": "movies.dat", - "delimiter": "::" - }, - "fields": ["id", "title", "genres"] - } -} diff --git a/recommender_system/data/config_generator.py b/recommender_system/data/config_generator.py deleted file mode 100644 index 4ca496a252dffc62ed62bb8f2a5ee1661a940580..0000000000000000000000000000000000000000 --- a/recommender_system/data/config_generator.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/env python2 -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -config_generator.py - -Usage: - ./config_generator.py [--output_format=] - ./config_generator.py -h | --help - -Options: - -h --help Show this screen. - --output_format= Output Config format(json or yaml) [default: json]. -""" - -import json -import docopt -import copy - -DEFAULT_FILE = {"type": "split", "delimiter": ","} - -DEFAULT_FIELD = { - "id": { - "type": "id" - }, - "gender": { - "name": "gender", - "type": "embedding", - "dict": { - "type": "char_based" - } - }, - "age": { - "name": "age", - "type": "embedding", - "dict": { - "type": "whole_content", - "sort": True - } - }, - "occupation": { - "name": "occupation", - "type": "embedding", - "dict": { - "type": "whole_content", - "sort": "true" - } - }, - "title": { - "regex": { - "pattern": r"^(.*)\((\d+)\)$", - "group_id": 1, - "strip": True - }, - "name": "title", - "type": { - "name": "embedding", - "seq_type": "sequence", - }, - "dict": { - "type": "char_based" - } - }, - "genres": { - "type": "one_hot_dense", - "dict": { - "type": "split", - "delimiter": "|" - }, - "name": "genres" - } -} - - -def merge_dict(master_dict, slave_dict): - return dict(((k, master_dict.get(k) or slave_dict.get(k)) - for k in set(slave_dict) | set(master_dict))) - - -def main(filename, fmt): - with open(filename, 'r') as f: - conf = json.load(f) - obj = dict() - for k in conf: - val = conf[k] - file_dict = val['file'] - file_dict = merge_dict(file_dict, DEFAULT_FILE) - - fields = [] - for pos, field_key in enumerate(val['fields']): - assert isinstance(field_key, basestring) - field = copy.deepcopy(DEFAULT_FIELD[field_key]) - field['pos'] = pos - fields.append(field) - obj[k] = {"file": file_dict, "fields": fields} - meta = {"meta": obj} - # print meta - if fmt == 'json': - - def formatter(x): - import json - return json.dumps(x, indent=2) - elif fmt == 'yaml': - - def formatter(x): - import yaml - return yaml.safe_dump(x, default_flow_style=False) - else: - raise NotImplementedError("Dump format %s is not implemented" % fmt) - - print formatter(meta) - - -if __name__ == '__main__': - args = docopt.docopt(__doc__, version="0.1.0") - main(args[""], args["--output_format"]) diff --git a/recommender_system/data/getdata.sh b/recommender_system/data/getdata.sh deleted file mode 100755 index 2268d876389e0bdf5ead405e74d278d276626f82..0000000000000000000000000000000000000000 --- a/recommender_system/data/getdata.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -ex -cd "$(dirname "$0")" -# download the dataset -wget http://files.grouplens.org/datasets/movielens/ml-1m.zip -# unzip the dataset -unzip ml-1m.zip -# remove the unused zip file -rm ml-1m.zip diff --git a/recommender_system/data/meta_generator.py b/recommender_system/data/meta_generator.py deleted file mode 100644 index b15d8ec31c4b2696ac6e8bb36a57c068f0573d81..0000000000000000000000000000000000000000 --- a/recommender_system/data/meta_generator.py +++ /dev/null @@ -1,430 +0,0 @@ -#!/bin/env python2 -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Preprocess Movielens dataset, to get movie/user object. - -Usage: - ./preprocess.py [--config=] - ./preprocess.py -h | --help - -Options: - -h --help Show this screen. - --version Show version. - --config= Get MetaData config file [default: config.json]. -""" -import docopt -import os -import sys -import re -import collections - -try: - import cPickle as pickle -except ImportError: - import pickle - - -class UniqueIDGenerator(object): - def __init__(self): - self.pool = collections.defaultdict(self.__next_id__) - self.next_id = 0 - - def __next_id__(self): - tmp = self.next_id - self.next_id += 1 - return tmp - - def __call__(self, k): - return self.pool[k] - - def to_list(self): - ret_val = [None] * len(self.pool) - for k in self.pool.keys(): - ret_val[self.pool[k]] = k - return ret_val - - -class SortedIDGenerator(object): - def __init__(self): - self.__key_set__ = set() - self.dict = None - - def scan(self, key): - self.__key_set__.add(key) - - def finish_scan(self, compare=None, key=None, reverse=False): - self.__key_set__ = sorted( - list(self.__key_set__), cmp=compare, key=key, reverse=reverse) - self.dict = dict() - for idx, each_key in enumerate(self.__key_set__): - self.dict[each_key] = idx - - def __call__(self, key): - return self.dict[key] - - def to_list(self): - return self.__key_set__ - - -class SplitFileReader(object): - def __init__(self, work_dir, config): - assert isinstance(config, dict) - self.filename = config['name'] - self.delimiter = config.get('delimiter', ',') - self.work_dir = work_dir - - def read(self): - with open(os.path.join(self.work_dir, self.filename), 'r') as f: - for line in f: - line = line.strip() - if isinstance(self.delimiter, unicode): - self.delimiter = str(self.delimiter) - yield line.split(self.delimiter) - - @staticmethod - def create(work_dir, config): - assert isinstance(config, dict) - if config['type'] == 'split': - return SplitFileReader(work_dir, config) - - -class IFileReader(object): - READERS = [SplitFileReader] - - def read(self): - raise NotImplementedError() - - @staticmethod - def create(work_dir, config): - for reader_cls in IFileReader.READERS: - val = reader_cls.create(work_dir, config) - if val is not None: - return val - - -class IDFieldParser(object): - TYPE = 'id' - - def __init__(self, config): - self.__max_id__ = -sys.maxint - 1 - self.__min_id__ = sys.maxint - self.__id_count__ = 0 - - def scan(self, line): - idx = int(line) - self.__max_id__ = max(self.__max_id__, idx) - self.__min_id__ = min(self.__min_id__, idx) - self.__id_count__ += 1 - - def parse(self, line): - return int(line) - - def meta_field(self): - return { - "is_key": True, - 'max': self.__max_id__, - 'min': self.__min_id__, - 'count': self.__id_count__, - 'type': 'id' - } - - -class SplitEmbeddingDict(object): - def __init__(self, delimiter): - self.__id__ = UniqueIDGenerator() - self.delimiter = delimiter - - def scan(self, multi): - for val in multi.split(self.delimiter): - self.__id__(val) - - def parse(self, multi): - return map(self.__id__, multi.split(self.delimiter)) - - def meta_field(self): - return self.__id__.to_list() - - -class EmbeddingFieldParser(object): - TYPE = 'embedding' - - NO_SEQUENCE = "no_sequence" - SEQUENCE = "sequence" - - class CharBasedEmbeddingDict(object): - def __init__(self, is_seq=True): - self.__id__ = UniqueIDGenerator() - self.is_seq = is_seq - - def scan(self, s): - for ch in s: - self.__id__(ch) - - def parse(self, s): - return map(self.__id__, s) if self.is_seq else self.__id__(s[0]) - - def meta_field(self): - return self.__id__.to_list() - - class WholeContentDict(object): - def __init__(self, need_sort=True): - assert need_sort - self.__id__ = SortedIDGenerator() - self.__has_finished__ = False - - def scan(self, txt): - self.__id__.scan(txt) - - def meta_field(self): - if not self.__has_finished__: - self.__id__.finish_scan() - self.__has_finished__ = True - return self.__id__.to_list() - - def parse(self, txt): - return self.__id__(txt) - - def __init__(self, config): - try: - self.seq_type = config['type']['seq_type'] - except TypeError: - self.seq_type = EmbeddingFieldParser.NO_SEQUENCE - - if config['dict']['type'] == 'char_based': - self.dict = EmbeddingFieldParser.CharBasedEmbeddingDict( - self.seq_type == EmbeddingFieldParser.SEQUENCE) - elif config['dict']['type'] == 'split': - self.dict = SplitEmbeddingDict(config['dict'].get('delimiter', ',')) - elif config['dict']['type'] == 'whole_content': - self.dict = EmbeddingFieldParser.WholeContentDict( - config['dict']['sort']) - else: - print config - assert False - - self.name = config['name'] - - def scan(self, s): - self.dict.scan(s) - - def meta_field(self): - return { - 'name': self.name, - 'dict': self.dict.meta_field(), - 'type': 'embedding', - 'seq': self.seq_type - } - - def parse(self, s): - return self.dict.parse(s) - - -class OneHotDenseFieldParser(object): - TYPE = 'one_hot_dense' - - def __init__(self, config): - if config['dict']['type'] == 'split': - self.dict = SplitEmbeddingDict(config['dict']['delimiter']) - self.name = config['name'] - - def scan(self, s): - self.dict.scan(s) - - def meta_field(self): - # print self.dict.meta_field() - return { - 'dict': self.dict.meta_field(), - 'name': self.name, - 'type': 'one_hot_dense' - } - - def parse(self, s): - ids = self.dict.parse(s) - retv = [0.0] * len(self.dict.meta_field()) - for idx in ids: - retv[idx] = 1.0 - # print retv - return retv - - -class FieldParserFactory(object): - PARSERS = [IDFieldParser, EmbeddingFieldParser, OneHotDenseFieldParser] - - @staticmethod - def create(config): - if isinstance(config['type'], basestring): - config_type = config['type'] - elif isinstance(config['type'], dict): - config_type = config['type']['name'] - - assert config_type is not None - - for each_parser_cls in FieldParserFactory.PARSERS: - if config_type == each_parser_cls.TYPE: - return each_parser_cls(config) - print config - - -class CompositeFieldParser(object): - def __init__(self, parser, extractor): - self.extractor = extractor - self.parser = parser - - def scan(self, *args, **kwargs): - self.parser.scan(self.extractor.extract(*args, **kwargs)) - - def parse(self, *args, **kwargs): - return self.parser.parse(self.extractor.extract(*args, **kwargs)) - - def meta_field(self): - return self.parser.meta_field() - - -class PositionContentExtractor(object): - def __init__(self, pos): - self.pos = pos - - def extract(self, line): - assert isinstance(line, list) - return line[self.pos] - - -class RegexPositionContentExtractor(PositionContentExtractor): - def __init__(self, pos, pattern, group_id, strip=True): - PositionContentExtractor.__init__(self, pos) - pattern = pattern.strip() - self.pattern = re.compile(pattern) - self.group_id = group_id - self.strip = strip - - def extract(self, line): - line = PositionContentExtractor.extract(self, line) - match = self.pattern.match(line) - # print line, self.pattern.pattern, match - assert match is not None - txt = match.group(self.group_id) - if self.strip: - txt.strip() - return txt - - -class ContentExtractorFactory(object): - def extract(self, line): - pass - - @staticmethod - def create(config): - if 'pos' in config: - if 'regex' not in config: - return PositionContentExtractor(config['pos']) - else: - extra_args = config['regex'] - return RegexPositionContentExtractor( - pos=config['pos'], **extra_args) - - -class MetaFile(object): - def __init__(self, work_dir): - self.work_dir = work_dir - self.obj = dict() - - def parse(self, config): - config = config['meta'] - - ret_obj = dict() - for key in config.keys(): - val = config[key] - assert 'file' in val - reader = IFileReader.create(self.work_dir, val['file']) - assert reader is not None - assert 'fields' in val and isinstance(val['fields'], list) - fields_config = val['fields'] - field_parsers = map(MetaFile.__field_config_mapper__, fields_config) - - for each_parser in field_parsers: - assert each_parser is not None - - for each_block in reader.read(): - for each_parser in field_parsers: - each_parser.scan(each_block) - - metas = map(lambda x: x.meta_field(), field_parsers) - # print metas - key_index = filter( - lambda x: x is not None, - map(lambda (idx, meta): idx if 'is_key' in meta and meta['is_key'] else None, - enumerate(metas)))[0] - - key_map = [] - for i in range(min(key_index, len(metas))): - key_map.append(i) - for i in range(key_index + 1, len(metas)): - key_map.append(i) - - obj = {'__meta__': {'raw_meta': metas, 'feature_map': key_map}} - - for each_block in reader.read(): - idx = field_parsers[key_index].parse(each_block) - val = [] - for i, each_parser in enumerate(field_parsers): - if i != key_index: - val.append(each_parser.parse(each_block)) - obj[idx] = val - ret_obj[key] = obj - self.obj = ret_obj - return ret_obj - - @staticmethod - def __field_config_mapper__(conf): - assert isinstance(conf, dict) - extrator = ContentExtractorFactory.create(conf) - field_parser = FieldParserFactory.create(conf) - assert extrator is not None - assert field_parser is not None - return CompositeFieldParser(field_parser, extrator) - - def dump(self, fp): - pickle.dump(self.obj, fp, pickle.HIGHEST_PROTOCOL) - - -def preprocess(binary_filename, dataset_dir, config, **kwargs): - assert isinstance(config, str) - with open(config, 'r') as config_file: - file_loader = None - if config.lower().endswith('.yaml'): - import yaml - file_loader = yaml - elif config.lower().endswith('.json'): - import json - file_loader = json - config = file_loader.load(config_file) - meta = MetaFile(dataset_dir) - meta.parse(config) - with open(binary_filename, 'wb') as outf: - meta.dump(outf) - - -if __name__ == '__main__': - args = docopt.docopt(__doc__, version='0.1.0') - kwargs = dict() - for key in args.keys(): - if key != '--help': - param_name = key - assert isinstance(param_name, str) - param_name = param_name.replace('<', '') - param_name = param_name.replace('>', '') - param_name = param_name.replace('--', '') - kwargs[param_name] = args[key] - preprocess(**kwargs) diff --git a/recommender_system/data/requirements.txt b/recommender_system/data/requirements.txt deleted file mode 100644 index 1ea154584a428b6a389309f1f8def502e0aadfce..0000000000000000000000000000000000000000 --- a/recommender_system/data/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -PyYAML -docopt diff --git a/recommender_system/data/split.py b/recommender_system/data/split.py deleted file mode 100644 index be6869c22f04be1db0f8e9c35c73c851e4c490b0..0000000000000000000000000000000000000000 --- a/recommender_system/data/split.py +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/env python2 -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Separate movielens 1m dataset to train/test file. - -Usage: - ./separate.py [--test_ratio=] [--delimiter=] - ./separate.py -h | --help - -Options: - -h --help Show this screen. - --version Show version. - --test_ratio= Test ratio for separate [default: 0.1]. - --delimiter= File delimiter [default: ,]. -""" -import docopt -import collections -import random - - -def process(test_ratio, input_file, delimiter, **kwargs): - test_ratio = float(test_ratio) - rating_dict = collections.defaultdict(list) - with open(input_file, 'r') as f: - for line in f: - user_id = int(line.split(delimiter)[0]) - rating_dict[user_id].append(line.strip()) - - with open(input_file + ".train", 'w') as train_file: - with open(input_file + ".test", 'w') as test_file: - for k in rating_dict.keys(): - lines = rating_dict[k] - assert isinstance(lines, list) - random.shuffle(lines) - test_len = int(len(lines) * test_ratio) - for line in lines[:test_len]: - print >> test_file, line - - for line in lines[test_len:]: - print >> train_file, line - - -if __name__ == '__main__': - args = docopt.docopt(__doc__, version='0.1.0') - kwargs = dict() - for key in args.keys(): - if key != '--help': - param_name = key - assert isinstance(param_name, str) - param_name = param_name.replace('<', '') - param_name = param_name.replace('>', '') - param_name = param_name.replace('--', '') - kwargs[param_name] = args[key] - process(**kwargs) diff --git a/recommender_system/dataprovider.py b/recommender_system/dataprovider.py deleted file mode 100755 index 54a5ea6fb8e59fa559a394a0c2ec7ac07d89c2f8..0000000000000000000000000000000000000000 --- a/recommender_system/dataprovider.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer.PyDataProvider2 import * -from common_utils import meta_to_header - - -def __list_to_map__(lst): - ret_val = dict() - for each in lst: - k, v = each - ret_val[k] = v - return ret_val - - -def hook(settings, meta, **kwargs): - """ - Init hook is invoked before process data. It will set obj.slots and store - data meta. - - :param obj: global object. It will passed to process routine. - :type obj: object - :param meta: the meta file object, which passed from trainer_config. Meta - file record movie/user features. - :param kwargs: unused other arguments. - """ - - # Header define slots that used for paddle. - # first part is movie features. - # second part is user features. - # final part is rating score. - # header is a list of [USE_SEQ_OR_NOT?, SlotType] - movie_headers = list(meta_to_header(meta, 'movie')) - settings.movie_names = [h[0] for h in movie_headers] - headers = movie_headers - user_headers = list(meta_to_header(meta, 'user')) - settings.user_names = [h[0] for h in user_headers] - headers.extend(user_headers) - headers.append(("rating", dense_vector(1))) # Score - - # slot types. - settings.input_types = __list_to_map__(headers) - settings.meta = meta - - -@provider(init_hook=hook, cache=CacheType.CACHE_PASS_IN_MEM) -def process(settings, filename): - with open(filename, 'r') as f: - for line in f: - # Get a rating from file. - user_id, movie_id, score = map(int, line.split('::')[:-1]) - - # Scale score to [-2, +2] - score = float(score - 3) - - # Get movie/user features by movie_id, user_id - movie_meta = settings.meta['movie'][movie_id] - user_meta = settings.meta['user'][user_id] - - outputs = [('movie_id', movie_id - 1)] - - # Then add movie features - for i, each_meta in enumerate(movie_meta): - outputs.append((settings.movie_names[i + 1], each_meta)) - - # Then add user id. - outputs.append(('user_id', user_id - 1)) - - # Then add user features. - for i, each_meta in enumerate(user_meta): - outputs.append((settings.user_names[i + 1], each_meta)) - - # Finally, add score - outputs.append(('rating', [score])) - # Return data to paddle - yield __list_to_map__(outputs) diff --git a/recommender_system/evaluate.py b/recommender_system/evaluate.py deleted file mode 100755 index 3afa7a1e9db5fefb1bbf5aaa174b8168afae4058..0000000000000000000000000000000000000000 --- a/recommender_system/evaluate.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/python -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import sys -import re -import math - - -def get_best_pass(log_filename): - with open(log_filename, 'r') as f: - text = f.read() - pattern = re.compile('Test.*? cost=([0-9]+\.[0-9]+).*?pass-([0-9]+)', - re.S) - results = re.findall(pattern, text) - sorted_results = sorted(results, key=lambda result: float(result[0])) - return sorted_results[0] - - -log_filename = sys.argv[1] -log = get_best_pass(log_filename) -predict_error = math.sqrt(float(log[0])) / 2 -print 'Best pass is %s, error is %s, which means predict get error as %f' % ( - log[1], log[0], predict_error) - -evaluate_pass = "output/pass-%s" % log[1] -print "evaluating from pass %s" % evaluate_pass diff --git a/recommender_system/prediction.py b/recommender_system/prediction.py deleted file mode 100755 index 9824d132d276dfe4ff4ea336d8c6b483949b9d08..0000000000000000000000000000000000000000 --- a/recommender_system/prediction.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/env python2 -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from py_paddle import swig_paddle, DataProviderConverter - -from common_utils import * -from paddle.trainer.config_parser import parse_config - -try: - import cPickle as pickle -except ImportError: - import pickle -import sys - -if __name__ == '__main__': - model_path = sys.argv[1] - swig_paddle.initPaddle('--use_gpu=0') - conf = parse_config("trainer_config.py", "is_predict=1") - network = swig_paddle.GradientMachine.createFromConfigProto( - conf.model_config) - assert isinstance(network, swig_paddle.GradientMachine) - network.loadParameters(model_path) - with open('./data/meta.bin', 'rb') as f: - meta = pickle.load(f) - headers = [h[1] for h in meta_to_header(meta, 'movie')] - headers.extend([h[1] for h in meta_to_header(meta, 'user')]) - cvt = DataProviderConverter(headers) - while True: - movie_id = int(raw_input("Input movie_id: ")) - user_id = int(raw_input("Input user_id: ")) - movie_meta = meta['movie'][movie_id] # Query Data From Meta. - user_meta = meta['user'][user_id] - data = [movie_id - 1] - data.extend(movie_meta) - data.append(user_id - 1) - data.extend(user_meta) - print "Prediction Score is %.2f" % ( - network.forwardTest(cvt.convert([data]))[0]['value'][0][0] + 3) diff --git a/recommender_system/preprocess.sh b/recommender_system/preprocess.sh deleted file mode 100755 index 9603f7997d6fb46411b169e780a262def7a398b3..0000000000000000000000000000000000000000 --- a/recommender_system/preprocess.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -UNAME_STR=`uname` - -if [[ ${UNAME_STR} == 'Linux' ]]; then - SHUF_PROG='shuf' -else - SHUF_PROG='gshuf' -fi - - -cd "$(dirname "$0")" -delimiter='::' -dir=ml-1m -cd data -echo 'generate meta config file' -python config_generator.py config.json > meta_config.json -echo 'generate meta file' -python meta_generator.py $dir meta.bin --config=meta_config.json -echo 'split train/test file' -python split.py $dir/ratings.dat --delimiter=${delimiter} --test_ratio=0.1 -echo 'shuffle train file' -${SHUF_PROG} $dir/ratings.dat.train > ratings.dat.train -cp $dir/ratings.dat.test . -echo "./data/ratings.dat.train" > train.list -echo "./data/ratings.dat.test" > test.list diff --git a/recommender_system/train.py b/recommender_system/train.py new file mode 100644 index 0000000000000000000000000000000000000000..62af9921feec5269e723ad6df8cbaaa9b0098bfe --- /dev/null +++ b/recommender_system/train.py @@ -0,0 +1,124 @@ +import paddle.v2 as paddle +import cPickle +import copy + + +def main(): + paddle.init(use_gpu=False) + movie_title_dict = paddle.dataset.movielens.get_movie_title_dict() + uid = paddle.layer.data( + name='user_id', + type=paddle.data_type.integer_value( + paddle.dataset.movielens.max_user_id() + 1)) + usr_emb = paddle.layer.embedding(input=uid, size=32) + + usr_gender_id = paddle.layer.data( + name='gender_id', type=paddle.data_type.integer_value(2)) + usr_gender_emb = paddle.layer.embedding(input=usr_gender_id, size=16) + + usr_age_id = paddle.layer.data( + name='age_id', + type=paddle.data_type.integer_value( + len(paddle.dataset.movielens.age_table))) + usr_age_emb = paddle.layer.embedding(input=usr_age_id, size=16) + + usr_job_id = paddle.layer.data( + name='job_id', + type=paddle.data_type.integer_value( + paddle.dataset.movielens.max_job_id() + 1)) + + usr_job_emb = paddle.layer.embedding(input=usr_job_id, size=16) + + usr_combined_features = paddle.layer.fc( + input=[usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb], + size=200, + act=paddle.activation.Tanh()) + + mov_id = paddle.layer.data( + name='movie_id', + type=paddle.data_type.integer_value( + paddle.dataset.movielens.max_movie_id() + 1)) + mov_emb = paddle.layer.embedding(input=mov_id, size=32) + + mov_categories = paddle.layer.data( + name='category_id', + type=paddle.data_type.sparse_binary_vector( + len(paddle.dataset.movielens.movie_categories()))) + + mov_categories_hidden = paddle.layer.fc(input=mov_categories, size=32) + + mov_title_id = paddle.layer.data( + name='movie_title', + type=paddle.data_type.integer_value_sequence(len(movie_title_dict))) + mov_title_emb = paddle.layer.embedding(input=mov_title_id, size=32) + mov_title_conv = paddle.networks.sequence_conv_pool( + input=mov_title_emb, hidden_size=32, context_len=3) + + mov_combined_features = paddle.layer.fc( + input=[mov_emb, mov_categories_hidden, mov_title_conv], + size=200, + act=paddle.activation.Tanh()) + + inference = paddle.layer.cos_sim( + a=usr_combined_features, b=mov_combined_features, size=1, scale=5) + cost = paddle.layer.regression_cost( + input=inference, + label=paddle.layer.data( + name='score', type=paddle.data_type.dense_vector(1))) + + parameters = paddle.parameters.create(cost) + + trainer = paddle.trainer.SGD( + cost=cost, + parameters=parameters, + update_equation=paddle.optimizer.Adam(learning_rate=1e-4)) + feeding = { + 'user_id': 0, + 'gender_id': 1, + 'age_id': 2, + 'job_id': 3, + 'movie_id': 4, + 'category_id': 5, + 'movie_title': 6, + 'score': 7 + } + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d Batch %d Cost %.2f" % ( + event.pass_id, event.batch_id, event.cost) + + trainer.train( + reader=paddle.batch( + paddle.reader.shuffle( + paddle.dataset.movielens.train(), buf_size=8192), + batch_size=256), + event_handler=event_handler, + feeding=feeding, + num_passes=1) + + user_id = 234 + movie_id = 345 + + user = paddle.dataset.movielens.user_info()[user_id] + movie = paddle.dataset.movielens.movie_info()[movie_id] + + feature = user.value() + movie.value() + + def reader(): + yield feature + + infer_dict = copy.copy(feeding) + del infer_dict['score'] + + prediction = paddle.infer( + output_layer=inference, + parameters=parameters, + input=[feature], + feeding=infer_dict) + print(prediction + 5) / 2 + + +if __name__ == '__main__': + main() diff --git a/recommender_system/train.sh b/recommender_system/train.sh deleted file mode 100755 index e341d1cc7a3267bef9db916719b2e4b1981e31bc..0000000000000000000000000000000000000000 --- a/recommender_system/train.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -paddle train \ - --config=trainer_config.py \ - --save_dir=./output \ - --use_gpu=false \ - --trainer_count=4\ - --test_all_data_in_one_period=true \ - --log_period=100 \ - --dot_period=1 \ - --num_passes=50 2>&1 | tee 'log.txt' diff --git a/recommender_system/trainer_config.py b/recommender_system/trainer_config.py deleted file mode 100755 index c2eeb7b874c7667809a401347f43b873b8dea92a..0000000000000000000000000000000000000000 --- a/recommender_system/trainer_config.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -try: - import cPickle as pickle -except ImportError: - import pickle - -is_predict = get_config_arg('is_predict', bool, False) - -META_FILE = 'data/meta.bin' - -with open(META_FILE, 'rb') as f: - # load meta file - meta = pickle.load(f) - -if not is_predict: - define_py_data_sources2( - 'data/train.list', - 'data/test.list', - module='dataprovider', - obj='process', - args={'meta': meta}) - -settings( - batch_size=1600, learning_rate=1e-3, learning_method=RMSPropOptimizer()) - -movie_meta = meta['movie']['__meta__']['raw_meta'] -user_meta = meta['user']['__meta__']['raw_meta'] - -movie_id = data_layer('movie_id', size=movie_meta[0]['max']) -title = data_layer('title', size=len(movie_meta[1]['dict'])) -genres = data_layer('genres', size=len(movie_meta[2]['dict'])) -user_id = data_layer('user_id', size=user_meta[0]['max']) -gender = data_layer('gender', size=len(user_meta[1]['dict'])) -age = data_layer('age', size=len(user_meta[2]['dict'])) -occupation = data_layer('occupation', size=len(user_meta[3]['dict'])) - -embsize = 256 - -# construct movie feature -movie_id_emb = embedding_layer(input=movie_id, size=embsize) -movie_id_hidden = fc_layer(input=movie_id_emb, size=embsize) - -genres_emb = fc_layer(input=genres, size=embsize) - -title_emb = embedding_layer(input=title, size=embsize) -title_hidden = text_conv_pool( - input=title_emb, context_len=5, hidden_size=embsize) - -movie_feature = fc_layer( - input=[movie_id_hidden, title_hidden, genres_emb], size=embsize) - -# construct user feature -user_id_emb = embedding_layer(input=user_id, size=embsize) -user_id_hidden = fc_layer(input=user_id_emb, size=embsize) - -gender_emb = embedding_layer(input=gender, size=embsize) -gender_hidden = fc_layer(input=gender_emb, size=embsize) - -age_emb = embedding_layer(input=age, size=embsize) -age_hidden = fc_layer(input=age_emb, size=embsize) - -occup_emb = embedding_layer(input=occupation, size=embsize) -occup_hidden = fc_layer(input=occup_emb, size=embsize) - -user_feature = fc_layer( - input=[user_id_hidden, gender_hidden, age_hidden, occup_hidden], - size=embsize) - -similarity = cos_sim(a=movie_feature, b=user_feature, scale=2) - -if not is_predict: - lbl = data_layer('rating', size=1) - cost = regression_cost(input=similarity, label=lbl) - outputs(cost) - -else: - outputs(similarity)