From b069cead59bad5247c98c4da07a13e7c55386a14 Mon Sep 17 00:00:00 2001
From: Yao544303
Date: Fri, 2 Nov 2018 15:42:50 +0800
Subject: [PATCH] fix readme

---
 README.md | 51 ++--
 ...50\350\215\220\347\263\273\347\273\237.md" | 86 +++++++
 py3.x/RS-itemcf.py | 226 +++++++++++++++++
 py3.x/RS-sklearn-rating.py | 189 ++++++++++++++
 py3.x/RS-usercf.py | 237 ++++++++++++++++++
 .../col_filtering => py3.x}/item_cf.py | 0
 py3.x/python/Recommender.py | 28 +++
 .../similarity_by_sklearn.py | 0
 py3.x/sklearn-RS-demo-cf-item-test.py | 200 +++++++++++++++
 py3.x/sklearn-RS-demo-item.py | 31 +++
 py3.x/sklearn-RS-demo-user.py | 32 +++
 py3.x/sklearn-RS-demo.py | 18 ++
 py3.x/test_evaluation_model.py | 73 ++++++
 py3.x/test_graph-based.py | 16 ++
 py3.x/test_lfm.py | 43 ++++
 ...72\344\272\216\347\211\251\345\223\201.py" | 65 +++++
 ...72\344\272\216\347\224\250\346\210\267.py" | 80 ++++++
 .../col_filtering => py3.x}/user_cf.py | 0
 standalone/col_filtering/__init__.py | 0
 .../metadata.json | 1 -
 .../output.pkl | Bin 1960 -> 0 bytes
 .../metadata.json | 1 -
 .../output.pkl | Bin 1960 -> 0 bytes
 .../get_data/func_code.py | 5 -
 24 files changed, 1354 insertions(+), 28 deletions(-)
 create mode 100644 "manual/16.\346\216\250\350\215\220\347\263\273\347\273\237.md"
 create mode 100644 py3.x/RS-itemcf.py
 create mode 100644 py3.x/RS-sklearn-rating.py
 create mode 100644 py3.x/RS-usercf.py
 rename {standalone/col_filtering => py3.x}/item_cf.py (100%)
 create mode 100644 py3.x/python/Recommender.py
 rename {standalone/col_filtering => py3.x}/similarity_by_sklearn.py (100%)
 create mode 100644 py3.x/sklearn-RS-demo-cf-item-test.py
 create mode 100644 py3.x/sklearn-RS-demo-item.py
 create mode 100644 py3.x/sklearn-RS-demo-user.py
 create mode 100644 py3.x/sklearn-RS-demo.py
 create mode 100644 py3.x/test_evaluation_model.py
 create mode 100644 py3.x/test_graph-based.py
 create mode 100644 py3.x/test_lfm.py
 create mode 100644 "py3.x/test_\345\237\272\344\272\216\347\211\251\345\223\201.py"
 create mode 100644 "py3.x/test_\345\237\272\344\272\216\347\224\250\346\210\267.py"
 rename {standalone/col_filtering => py3.x}/user_cf.py (100%)
 delete mode 100644 standalone/col_filtering/__init__.py
 delete mode 100644 standalone/col_filtering/mycache/joblib/__main__-C%3A-dtworkspace-RecommenderSystems-standalone-col_filtering-similarity_by_sklearn/get_data/5df639254df90ffb4b58eba85a36303c/metadata.json
 delete mode 100644 standalone/col_filtering/mycache/joblib/__main__-C%3A-dtworkspace-RecommenderSystems-standalone-col_filtering-similarity_by_sklearn/get_data/5df639254df90ffb4b58eba85a36303c/output.pkl
 delete mode 100644 standalone/col_filtering/mycache/joblib/__main__-C%3A-dtworkspace-RecommenderSystems-standalone-col_filtering-similarity_by_sklearn/get_data/9c0e4967ea839d4f938fcd3dc25572d0/metadata.json
 delete mode 100644 standalone/col_filtering/mycache/joblib/__main__-C%3A-dtworkspace-RecommenderSystems-standalone-col_filtering-similarity_by_sklearn/get_data/9c0e4967ea839d4f938fcd3dc25572d0/output.pkl
 delete mode 100644 standalone/col_filtering/mycache/joblib/__main__-C%3A-dtworkspace-RecommenderSystems-standalone-col_filtering-similarity_by_sklearn/get_data/func_code.py

diff --git a/README.md b/README.md
index eefe7b1..0cc2d15 100644
--- a/README.md
+++ b/README.md
@@ -1,40 +1,49 @@
 # Project Introduction
-Xiang Liang's [Recommender System Practice (《推荐系统实践》)](https://book.douban.com/subject/10769749/) is a classic book on recommender systems.
-But it was written before today's big-data tooling became widespread, so the demo code in the book cannot run on large datasets.
-Hence the idea of reimplementing the same material with Spark.
+An introduction to the basics of recommender systems, with the related algorithms and their implementations.

 # Directory Layout
-* data        test datasets
-* standalone  single-machine implementations (mostly Python)
-* spark       Spark implementations (mostly Scala)
-  feature engineering and ItemCF are already done
+* data    test datasets
+* py3.x   Python implementations
+* spark   Spark implementations
 * manual  related reference material
+  * paper reading notes
+  * notes on fundamentals
+
+
+# Content Guide
+## Python implementations (mainly for understanding the principles)
+* ItemCF (one version using sklearn and one version not using sklearn)
+* UserCF (one version using sklearn and one version not using sklearn)
+* LFM
+* Graph-Based
+
+## Spark implementations
+* Feature engineering
+* ItemCF

 # Planned Items (yes, this means holes to be filled in later)
 ## Recommendation algorithm implementations
 ### Algorithms based on user behavior data
-ItemCF
-UserCF
-Association rules
-LFM
-Graph
-ALS
+* Association rules
+* LFM
+* Graph
+* ALS

 ### Algorithms using user tag data
-LDA
-TFIDF
-TagCF
-
-## Exploratory research (implementations of individual papers)
-Markov Chain
-Social networks
+* LDA
+* TFIDF
+* TagCF
+
+### Exploratory research (implementations of individual papers)
+* Markov Chain
+* Social networks
+* Deep-learning-based recommendation algorithms
 ....

 ## Evaluation system implementation
-
 ## Recommender system architecture implementation
 ### Peripheral architecture
 #### User behavior log storage system
diff --git "a/manual/16.\346\216\250\350\215\220\347\263\273\347\273\237.md" "b/manual/16.\346\216\250\350\215\220\347\263\273\347\273\237.md"
new file mode 100644
index 0000000..18498a2
--- /dev/null
+++ "b/manual/16.\346\216\250\350\215\220\347\263\273\347\273\237.md"
@@ -0,0 +1,86 @@
+# Chapter 16: Recommender Systems
+
+## Background and Goals
+
+With the rapid growth of the Internet, users find it hard to quickly locate what interests them in a flood of information. This gave rise to search engines and recommender systems.
+
+This chapter covers recommender systems, which aim to:
+
+1. Help users discover information they are, or may be, interested in.
+2. Let a site's valuable content stand out and gain recognition from a broad user base.
+3. Increase user loyalty and engagement, building a stable user community.
+
+## Approach and Process
+
+The goal of this case study is to make recommendations to users, i.e., to establish links between users and items (web pages, in this case) in some way.
+
+Since there are many records of user visits, feeding all records directly into a recommender without any grouping leads to the following problems:
+
+1. A huge data volume means many items and many users, so building the sparse user-item matrix can exhaust memory, and model computation takes a long time.
+2. Users differ greatly: different users care about different information, so even if recommendations can be produced, their quality will be poor.
+
+To avoid these problems, the data needs to be grouped before analysis.
+
+Normally one would group users by interests and needs.
+But the visit logs do not record how long a user stays on a page, so user interests are hard to infer.
+This chapter therefore analyzes the pages users browse instead: visits are grouped by page type, and recommendations are made within each type.
+
+The analysis proceeds as follows:
+
+* Obtain the raw records of user visits from the system.
+* Perform multi-dimensional analysis of the data, including visited content, churned users, and user segmentation.
+* Preprocess the data, including deduplication, transformation, and classification.
+* Filter the data down to visits of pages with an html suffix.
+* Compare several recommendation algorithms, evaluate the models, and keep the best one. Use it to predict on sample data and produce recommendations.
+
+
+
+## Mainstream Recommendation Algorithms
+
+| Method | Description |
+| --- | --- |
+| Content-based recommendation | |
+| Collaborative filtering | |
+| Rule-based recommendation | |
+| Utility-based recommendation | |
+| Knowledge-based recommendation | |
+| Hybrid recommendation | |
+
+![Comparison of recommendation methods](/img/ml/16.RecommendedSystem/推荐方法对比.png)
+
+### Knowledge-based Recommendation
+
+Knowledge-based recommendation can to some extent be seen as an inference technique; it is not built solely on user needs and preferences. Knowledge-based methods differ markedly in the functional knowledge they use. Functional knowledge captures how an item satisfies a particular user, and can therefore explain the relationship between a need and a recommendation. The user profile can be any knowledge structure that supports this inference: a normalized query from the user, or a more detailed representation of the user's needs.
+
+![Knowledge-based recommendation](/img/ml/16.RecommendedSystem/基于知识的推荐.jpg)
+
+### Collaborative Filtering
+
+* Memory-based recommendation
+  * Item-based methods
+  * User-based methods
+  * Memory-based methods perform a nearest-neighbor search: each item or user is treated as a vector, and its similarity to every other item or user is computed. Once the pairwise similarities are available, prediction and recommendation follow directly.
+* Model-based recommendation
+  * The most common model-based method is matrix factorization.
+  * Matrix factorization decomposes the rating matrix R into the product of two matrices, considering only the observed ratings and ignoring missing entries during training. After R is factored into U and V, a missing entry of R can be estimated as the product of the corresponding row of U and column of V.
+  * The objective of matrix factorization (in a common form) is min Σ_{(u,i) observed} (R_ui − U_u · V_i)² + λ(‖U_u‖² + ‖V_i‖²); U and V can be obtained by gradient descent, alternately updating u and v over multiple iterations until convergence.
+  * The core idea behind matrix factorization is to find two matrices whose product matches the observed entries of R as closely as possible. The product of the two factors then approximately reconstructs R; since the observed entries are matched closely, the values produced for the missing entries tend to follow the overall trends.
+* Collaborative filtering mainly suffers from two problems: sparsity and cold start. Existing solutions usually introduce multiple data sources or side information: for a user this can be basic profile or persona information, and for an item it can be content information.
+
+## Evaluation
+
+1. Recall and precision (manual statistical analysis)
+2. F-score (P-R curve; suited to imbalanced problems)
+3. ROC and AUC (suited to comparing different results)
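+
+A minimal sketch of precision/recall at N (hedged: it assumes each user's ranked recommendation list and ground-truth item set are available as plain Python containers):
+
+```python
+def precision_recall_at_n(recommended, truth, n):
+    # recommended: dict user -> ranked list of items
+    # truth:       dict user -> set of held-out items
+    hit = sum(len(set(recommended[u][:n]) & truth[u]) for u in truth)
+    precision = hit / (n * len(truth))
+    recall = hit / sum(len(truth[u]) for u in truth)
+    return precision, recall
+```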
+
+* * *
+
+* **Author: [片刻](http://www.apache.wiki/display/~jiangzhonglian)**
+* [GitHub repository](https://github.com/apachecn/AiLearning):
+* **Copyright notice: you are welcome to repost this for study => please credit the source: [ApacheCN](http://www.apachecn.org/)**
+
+> Original sources of the excerpts:
+
+* [Common recommender-system algorithms and their pros and cons](http://www.36dsj.com/archives/9519)
+* [Knowledge-based recommendation](https://zhidao.baidu.com/question/2013524494179442228.html)
+* [A hybrid collaborative filtering model based on deep learning for recommender systems](http://www.iteye.com/news/32100)
diff --git a/py3.x/RS-itemcf.py b/py3.x/RS-itemcf.py
new file mode 100644
index 0000000..a6c757b
--- /dev/null
+++ b/py3.x/RS-itemcf.py
@@ -0,0 +1,226 @@
+#!/usr/bin/python
+# coding:utf8
+'''
+Created on 2015-06-22
+Update on 2017-05-16
+Author: Lockvictor/片刻
+Collaborative filtering source code for the book Recommender System Practice (《推荐系统实践》)
+Reference: https://github.com/Lockvictor/MovieLens-RecSys
+Updates: https://github.com/apachecn/AiLearning
+'''
+import sys
+import math
+import random
+from operator import itemgetter

+# make the random data reproducible
+random.seed(0)
+
+
+class ItemBasedCF():
+    ''' TopN recommendation - ItemBasedCF '''
+
+    def __init__(self):
+        self.trainset = {}
+        self.testset = {}
+
+        # n_sim_movie: top 20 similar movies, n_rec_movie: top 10 recommendations
+        self.n_sim_movie = 20
+        self.n_rec_movie = 10
+
+        # movie_sim_mat: movie-movie similarities, movie_popular: per-movie occurrence counts, movie_count: total number of movies
+        self.movie_sim_mat = {}
+        self.movie_popular = {}
+        self.movie_count = 0
+
+        print('Similar movie number = %d' % self.n_sim_movie, file=sys.stderr)
+        print('Recommended movie number = %d' % self.n_rec_movie, file=sys.stderr)
+
+    @staticmethod
+    def loadfile(filename):
+        """loadfile (load a file and return a generator)
+
+        Args:
+            filename   name of the file
+        Returns:
+            line   one line at a time, stripped of line endings
+        """
+        fp = open(filename, 'r')
+        for i, line in enumerate(fp):
+            yield line.strip('\r\n')
+            if i > 0 and i % 100000 == 0:
+                print('loading %s(%s)' % (filename, i), file=sys.stderr)
+        fp.close()
+        print('load %s success' % filename, file=sys.stderr)
+
+    def generate_dataset(self, filename, pivot=0.7):
+        """generate_dataset (load the file and randomly split the data 7:3)
+
+        Args:
+            filename   name of the file
+            pivot      split ratio
+        """
+        trainset_len = 0
+        testset_len = 0
+
+        for line in self.loadfile(filename):
+            # user ID, movie, rating, timestamp
+            # user, movie, rating, _ = line.split('::')
+            user, movie, rating, _ = line.split('\t')
+            # compare a random number against pivot, then initialize the user's entry
+            if random.random() < pivot:
+
+                # dict.setdefault(key, default=None)
+                # key -- the key to look up
+                # default -- value to set if the key is missing
+                self.trainset.setdefault(user, {})
+                self.trainset[user][movie] = int(rating)
+                trainset_len += 1
+            else:
+                self.testset.setdefault(user, {})
+                self.testset[user][movie] = int(rating)
+                testset_len += 1
+
+        print('split into train and test sets successfully', file=sys.stderr)
+        print('train set = %s' % trainset_len, file=sys.stderr)
+        print('test set = %s' % testset_len, file=sys.stderr)
+
+    def calc_movie_sim(self):
+        """calc_movie_sim (compute movie-movie similarities)"""
+
+        print('counting movies number and popularity...', file=sys.stderr)
+
+        # count how often each movie occurs across all users; user, movies
+        for _, movies in self.trainset.items():
+            for movie in movies:
+                # count item popularity
+                if movie not in self.movie_popular:
+                    self.movie_popular[movie] = 0
+                self.movie_popular[movie] += 1
+
+        print('count movies number and popularity success', file=sys.stderr)
+
+        # save the total number of movies
+        self.movie_count = len(self.movie_popular)
+        print('total movie number = %d' % self.movie_count, file=sys.stderr)
+
+        # count how often each pair of movies co-occurs for the same user
+        itemsim_mat = self.movie_sim_mat
+        print('building co-rated users matrix...', file=sys.stderr)
+
+        # user, movies
+        for _, movies in self.trainset.items():
+            for m1 in movies:
+                for m2 in movies:
+                    if m1 == m2:
+                        continue
+                    itemsim_mat.setdefault(m1, {})
+                    itemsim_mat[m1].setdefault(m2, 0)
+                    itemsim_mat[m1][m2] += 1
+        print('build co-rated users matrix success', file=sys.stderr)
+
+        # calculate similarity matrix
+        print('calculating movie similarity matrix...', file=sys.stderr)
+        simfactor_count = 0
+        PRINT_STEP = 2000000
+        for m1, related_movies in itemsim_mat.items():
+            for m2, count in related_movies.items():
+                # cosine similarity: sim(m1, m2) = |N(m1) ∩ N(m2)| / sqrt(|N(m1)| * |N(m2)|)
+                itemsim_mat[m1][m2] = count / math.sqrt(
+                    self.movie_popular[m1] * self.movie_popular[m2])
+                simfactor_count += 1
+                # progress output
+                if simfactor_count % PRINT_STEP == 0:
+                    print('calculating movie similarity factor(%d)' % simfactor_count, file=sys.stderr)
+
+        print('calculate movie similarity matrix(similarity factor) success', file=sys.stderr)
+        print('Total similarity factor number = %d' % simfactor_count, file=sys.stderr)
+
+    # @profile
+    def recommend(self, user):
+        """recommend (take the top-K similar movies, accumulate similarity-weighted scores, and return the top-N movies)
+
+        Args:
+            user   the user
+        Returns:
+            rec_movie   list of recommended movies, sorted by score in descending order
+        """
+        ''' Find K similar movies and recommend N movies. '''
+        K = self.n_sim_movie
+        N = self.n_rec_movie
+        rank = {}
+        watched_movies = self.trainset[user]
+
+        # accumulate scores from the top-K most similar movies
+        # rating = the user's rating, w = similarity between movies
+        # profiling note: roughly 98% of the time is spent in the loop below
+        for movie, rating in watched_movies.items():
+            for related_movie, w in sorted(
+                    self.movie_sim_mat[movie].items(),
+                    key=itemgetter(1),
+                    reverse=True)[0:K]:
+                if related_movie in watched_movies:
+                    continue
+                rank.setdefault(related_movie, 0)
+                rank[related_movie] += w * rating
+        # return the N best movies
+        return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]
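+
+    # A worked toy example of the scoring rule above (hedged; the numbers are hypothetical):
+    # suppose user u rated m1=5 and m2=3, K=2, and the similarity rows are
+    #   sim(m1, .) = {m3: 0.8, m4: 0.5},  sim(m2, .) = {m3: 0.4, m5: 0.9}
+    # then rank[m3] = 0.8*5 + 0.4*3 = 5.2, rank[m4] = 0.5*5 = 2.5, rank[m5] = 0.9*3 = 2.7,
+    # so the top-2 recommendation is [m3, m5].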
+
+    def evaluate(self):
+        ''' return precision, recall, coverage and popularity '''
+        print('Evaluation start...', file=sys.stderr)
+
+        # number of recommendations returned per user
+        N = self.n_rec_movie
+        # variables for precision and recall
+        # hit counts matches between test set and recommendations, rec_count is the number of recommendations per user, test_count is the number of test-set movies per user
+        hit = 0
+        rec_count = 0
+        test_count = 0
+        # variables for coverage
+        all_rec_movies = set()
+        # variables for popularity
+        popular_sum = 0
+
+        # enumerate yields (index, value) pairs
+        # reference: http://blog.csdn.net/churximi/article/details/51648388
+        for i, user in enumerate(self.trainset):
+            if i > 0 and i % 500 == 0:
+                print('recommended for %d users' % i, file=sys.stderr)
+            test_movies = self.testset.get(user, {})
+            rec_movies = self.recommend(user)
+
+            # compare recommendations against the test set; movie, w
+            for movie, _ in rec_movies:
+                if movie in test_movies:
+                    hit += 1
+                all_rec_movies.add(movie)
+                # accumulate the log popularity of recommended movies
+                popular_sum += math.log(1 + self.movie_popular[movie])
+            rec_count += N
+            test_count += len(test_movies)
+
+        precision = hit / (1.0 * rec_count)
+        recall = hit / (1.0 * test_count)
+        coverage = len(all_rec_movies) / (1.0 * self.movie_count)
+        popularity = popular_sum / (1.0 * rec_count)
+
+        print('precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (
+            precision, recall, coverage, popularity), file=sys.stderr)
+
+
+if __name__ == '__main__':
+    # ratingfile = 'db/16.RecommenderSystems/ml-1m/ratings.dat'
+    ratingfile = 'db/16.RecommenderSystems/ml-100k/u.data'
+
+    # create the ItemCF object
+    itemcf = ItemBasedCF()
+    # split the data 7:3 into a train set and a test set, stored in itemcf.trainset and itemcf.testset
+    itemcf.generate_dataset(ratingfile, pivot=0.7)
+    # compute movie-movie similarities
+    itemcf.calc_movie_sim()
+    # evaluate the recommendations
+    # itemcf.evaluate()
+    # inspect the recommendations for one user
+    user = "2"
+    print("recommendations:", itemcf.recommend(user))
+    print("---", itemcf.testset.get(user, {}))
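
A note before the next file: sklearn's pairwise_distances with metric="cosine" returns cosine *distances* (1 - cosine similarity), so larger values mean less similar. A minimal sketch of obtaining similarities instead (the matrix here is hypothetical):

    import numpy as np
    from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

    X = np.array([[5, 3, 0], [4, 0, 0], [1, 1, 5]])
    sim_a = 1 - pairwise_distances(X, metric="cosine")
    sim_b = cosine_similarity(X)   # equivalent, up to floating point
    assert np.allclose(sim_a, sim_b)
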
diff --git a/py3.x/RS-sklearn-rating.py b/py3.x/RS-sklearn-rating.py
new file mode 100644
index 0000000..fea7ab0
--- /dev/null
+++ b/py3.x/RS-sklearn-rating.py
@@ -0,0 +1,189 @@
+#!/usr/bin/python
+# coding:utf8
+
+import sys
+import math
+from operator import itemgetter
+
+import numpy as np
+import pandas as pd
+from scipy.sparse.linalg import svds
+# sklearn.cross_validation was removed in sklearn 0.20; use model_selection instead
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error
+from sklearn.metrics.pairwise import pairwise_distances
+
+
+def splitData(dataFile, test_size):
+    # load the dataset
+    header = ['user_id', 'item_id', 'rating', 'timestamp']
+    df = pd.read_csv(dataFile, sep='\t', names=header)
+
+    n_users = df.user_id.unique().shape[0]
+    n_items = df.item_id.unique().shape[0]
+
+    print('Number of users = ' + str(n_users) + ' | Number of movies = ' +
+          str(n_items))
+    train_data, test_data = train_test_split(df, test_size=test_size)
+    print("data sizes:", len(train_data), len(test_data))
+    return df, n_users, n_items, train_data, test_data
+
+
+def calc_similarity(n_users, n_items, train_data, test_data):
+    # build user-item matrices for the train and test data
+    train_data_matrix = np.zeros((n_users, n_items))
+    for line in train_data.itertuples():
+        train_data_matrix[line[1] - 1, line[2] - 1] = line[3]
+    test_data_matrix = np.zeros((n_users, n_items))
+    for line in test_data.itertuples():
+        test_data_matrix[line[1] - 1, line[2] - 1] = line[3]
+
+    # use sklearn's pairwise_distances to compute cosine distances
+    # (note: these are distances, i.e. 1 - cosine similarity, not similarities)
+    print("1:", np.shape(train_data_matrix))    # rows: users, cols: movies
+    print("2:", np.shape(train_data_matrix.T))  # rows: movies, cols: users
+
+    user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
+    item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")
+
+    print('start counting popular items...', file=sys.stderr)
+    item_popular = {}
+    # count how often each item occurs across all users
+    for i_index in range(n_items):
+        if np.sum(train_data_matrix[:, i_index]) != 0:
+            item_popular[i_index] = np.sum(train_data_matrix[:, i_index] != 0)
+            # print("pop=", i_index, item_popular[i_index])
+
+    # save the total number of items
+    item_count = len(item_popular)
+    print('total number of popular items = %d' % item_count, file=sys.stderr)
+
+    return train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular
+
+
+def predict(rating, similarity, type='user'):
+    print(type)
+    print("rating=", np.shape(rating))
+    print("similarity=", np.shape(similarity))
+    if type == 'user':
+        # mean rating per user (axis=1 averages over a row's movies)
+        mean_user_rating = rating.mean(axis=1)
+        # np.newaxis reference: http://blog.csdn.net/xtingjie/article/details/72510834
+        rating_diff = (rating - mean_user_rating[:, np.newaxis])
+
+        # mean + user-user weights (943, 943) . user-movie rating diffs (943, 1682) = aggregated user-movie scores (943, 1682), normalized by each user's total weight
+        pred = mean_user_rating[:, np.newaxis] + similarity.dot(
+            rating_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
+    elif type == 'item':
+        # aggregated score: user-movie ratings (943, 1682) . item-item weights (1682, 1682) = user-movie scores (943, 1682), normalized by each item's total weight
+        pred = rating.dot(similarity) / np.array(
+            [np.abs(similarity).sum(axis=1)])
+    return pred
+
+
+def rmse(prediction, ground_truth):
+    prediction = prediction[ground_truth.nonzero()].flatten()
+    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
+    return math.sqrt(mean_squared_error(prediction, ground_truth))
+
+
+def evaluate(prediction, item_popular, name):
+    # note: relies on the module-level n_users, train_data_matrix and test_data_matrix defined in __main__
+    hit = 0
+    rec_count = 0
+    test_count = 0
+    popular_sum = 0
+    all_rec_items = set()
+    for u_index in range(n_users):
+        items = np.where(train_data_matrix[u_index, :] == 0)[0]
+        pre_items = sorted(
+            dict(zip(items, prediction[u_index, items])).items(),
+            key=itemgetter(1),
+            reverse=True)[:20]
+        test_items = np.where(test_data_matrix[u_index, :] != 0)[0]
+
+        # compare recommendations against the test set; item, w
+        for item, _ in pre_items:
+            if item in test_items:
+                hit += 1
+            all_rec_items.add(item)
+
+            # accumulate the log popularity of recommended items
+            if item in item_popular:
+                popular_sum += math.log(1 + item_popular[item])
+
+        rec_count += len(pre_items)
+        test_count += len(test_items)
+
+    precision = hit / (1.0 * rec_count)
+    recall = hit / (1.0 * test_count)
+    coverage = len(all_rec_items) / (1.0 * len(item_popular))
+    popularity = popular_sum / (1.0 * rec_count)
+    print('%s: precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (
+        name, precision, recall, coverage, popularity), file=sys.stderr)
+
+
+def recommend(u_index, prediction):
+    items = np.where(train_data_matrix[u_index, :] == 0)[0]
+    pre_items = sorted(
+        dict(zip(items, prediction[u_index, items])).items(),
+        key=itemgetter(1),
+        reverse=True)[:10]
+    test_items = np.where(test_data_matrix[u_index, :] != 0)[0]
+
+    print('ground truth:', test_items)
+    print('recommended:', [key for key, value in pre_items])
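+
+
+# A worked toy example (hedged) of the user-based formula in predict() with a 2x3 matrix:
+#   rating = [[4, 0, 2], [3, 3, 0]]  ->  mean_user_rating = [2.0, 2.0]
+#   rating_diff = [[2, -2, 0], [1, 1, -2]]
+#   pred[u, i] = mean_u + sum_v sim[u, v] * rating_diff[v, i] / sum_v |sim[u, v]|
+# i.e. each user's bias is removed before mixing in the neighbors' opinions, then added back.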
+
+
+if __name__ == "__main__":
+
+    # memory-based collaborative filtering
+    # ...
+    # split the dataset
+    # http://files.grouplens.org/datasets/movielens/ml-100k.zip
+    dataFile = 'db/16.RecommenderSystems/ml-100k/u.data'
+    df, n_users, n_items, train_data, test_data = splitData(
+        dataFile, test_size=0.25)
+
+    # compute similarities
+    train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular = calc_similarity(
+        n_users, n_items, train_data, test_data)
+
+    item_prediction = predict(train_data_matrix, item_similarity, type='item')
+    user_prediction = predict(train_data_matrix, user_similarity, type='user')
+
+    # evaluation: root mean squared error
+    print(
+        'Item based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))
+    print(
+        'User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
+
+    # model-based collaborative filtering
+    # ...
+    # compute the sparsity of the MovieLens dataset (n_users and n_items are fixed, so fewer user actions mean less information; the sparser the matrix, the more room there is to optimize)
+    sparsity = round(1.0 - len(df) / float(n_users * n_items), 3)
+    print('The sparsity level of MovieLens100K is ' + str(sparsity * 100) + '%')
+
+    # compute the k largest singular values/vectors of the sparse matrix
+    u, s, vt = svds(train_data_matrix, k=15)
+    s_diag_matrix = np.diag(s)
+    svd_prediction = np.dot(np.dot(u, s_diag_matrix), vt)
+    print("svd-shape:", np.shape(svd_prediction))
+    print(
+        'Model based CF RMSE: ' + str(rmse(svd_prediction, test_data_matrix)))
+    """
+    For the same amount of information, a smaller matrix carries more reliable signal.
+    Hence user-cf beats item-cf here; and after the SVD, roughly 15 dimensions already reach 90%+ of the effect, so the information is more reliable and the results are better.
+    item-cf: 1682
+    user-cf: 943
+    svd: 15
+    """
+    evaluate(item_prediction, item_popular, 'item')
+    evaluate(user_prediction, item_popular, 'user')
+    evaluate(svd_prediction, item_popular, 'svd')
+
+    # show recommendations
+    recommend(1, svd_prediction)
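
A small follow-up sketch (hedged): the choice of k=15 above can be checked by scanning several ranks and comparing the held-out RMSE, assuming the matrices and rmse() from the file above are in scope:

    for k in (5, 10, 15, 25, 50):
        u, s, vt = svds(train_data_matrix, k=k)
        pred = u.dot(np.diag(s)).dot(vt)
        print(k, rmse(pred, test_data_matrix))
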
diff --git a/py3.x/RS-usercf.py b/py3.x/RS-usercf.py
new file mode 100644
index 0000000..644e2b7
--- /dev/null
+++ b/py3.x/RS-usercf.py
@@ -0,0 +1,237 @@
+#!/usr/bin/python
+# coding:utf8
+'''
+Created on 2015-06-22
+Update on 2017-05-16
+Author: Lockvictor/片刻
+Collaborative filtering source code for the book Recommender System Practice (《推荐系统实践》)
+Reference: https://github.com/Lockvictor/MovieLens-RecSys
+Updates: https://github.com/apachecn/AiLearning
+'''
+import sys
+import math
+import random
+from operator import itemgetter
+print(__doc__)
+# make the random data reproducible
+random.seed(0)
+
+
+class UserBasedCF():
+    ''' TopN recommendation - UserBasedCF '''
+
+    def __init__(self):
+        self.trainset = {}
+        self.testset = {}
+
+        # n_sim_user: top 20 similar users, n_rec_movie: top 10 recommendations
+        self.n_sim_user = 20
+        self.n_rec_movie = 10
+
+        # user_sim_mat: user-user similarities, movie_popular: per-movie occurrence counts, movie_count: total number of movies
+        self.user_sim_mat = {}
+        self.movie_popular = {}
+        self.movie_count = 0
+
+        print('similar user number = %d' % self.n_sim_user, file=sys.stderr)
+        print('recommended movie number = %d' % self.n_rec_movie, file=sys.stderr)
+
+    @staticmethod
+    def loadfile(filename):
+        """loadfile (load a file and return a generator)
+
+        Args:
+            filename   name of the file
+        Returns:
+            line   one line at a time, stripped of line endings
+        """
+        fp = open(filename, 'r')
+        for i, line in enumerate(fp):
+            yield line.strip('\r\n')
+            if i > 0 and i % 100000 == 0:
+                print('loading %s(%s)' % (filename, i), file=sys.stderr)
+        fp.close()
+        print('load %s success' % filename, file=sys.stderr)
+
+    def generate_dataset(self, filename, pivot=0.7):
+        """generate_dataset (load the file and randomly split the data 7:3)
+
+        Args:
+            filename   name of the file
+            pivot      split ratio
+        """
+        trainset_len = 0
+        testset_len = 0
+
+        for line in self.loadfile(filename):
+            # user ID, movie, rating, timestamp
+            # user, movie, rating, timestamp = line.split('::')
+            user, movie, rating, _ = line.split('\t')
+            # compare a random number against pivot, then initialize the user's entry
+            if random.random() < pivot:
+
+                # dict.setdefault(key, default=None)
+                # key -- the key to look up
+                # default -- value to set if the key is missing
+                self.trainset.setdefault(user, {})
+                self.trainset[user][movie] = int(rating)
+                trainset_len += 1
+            else:
+                self.testset.setdefault(user, {})
+                self.testset[user][movie] = int(rating)
+                testset_len += 1
+
+        print('split into train and test sets successfully', file=sys.stderr)
+        print('train set = %s' % trainset_len, file=sys.stderr)
+        print('test set = %s' % testset_len, file=sys.stderr)
+
+    def calc_user_sim(self):
+        """calc_user_sim (compute user-user similarities)"""
+
+        # build inverse table for item-users
+        # key=movieID, value=list of userIDs who have seen this movie
+        print('building movie-users inverse table...', file=sys.stderr)
+        movie2users = dict()
+
+        # collect, per movie, the set of users who saw it,
+        # and count how often each movie occurs across all users
+        for user, movies in self.trainset.items():
+            for movie in movies:
+                # inverse table for item-users
+                if movie not in movie2users:
+                    movie2users[movie] = set()
+                movie2users[movie].add(user)
+                # count item popularity at the same time
+                if movie not in self.movie_popular:
+                    self.movie_popular[movie] = 0
+                self.movie_popular[movie] += 1
+
+        print('build movie-users inverse table success', file=sys.stderr)
+
+        # save the total movie number, which will be used in evaluation
+        self.movie_count = len(movie2users)
+        print('total movie number = %d' % self.movie_count, file=sys.stderr)
+
+        usersim_mat = self.user_sim_mat
+        # count how often each pair of users co-occurs on the same movie
+        print('building user co-rated movies matrix...', file=sys.stderr)
+
+        for movie, users in movie2users.items():
+            for u in users:
+                for v in users:
+                    if u == v:
+                        continue
+                    usersim_mat.setdefault(u, {})
+                    usersim_mat[u].setdefault(v, 0)
+                    usersim_mat[u][v] += 1
+        print('build user co-rated movies matrix success', file=sys.stderr)
+
+        # calculate similarity matrix
+        print('calculating user similarity matrix...', file=sys.stderr)
+        simfactor_count = 0
+        PRINT_STEP = 2000000
+        for u, related_users in usersim_mat.items():
+            for v, count in related_users.items():
+                # cosine similarity: sim(u, v) = |N(u) ∩ N(v)| / sqrt(|N(u)| * |N(v)|)
+                usersim_mat[u][v] = count / math.sqrt(
+                    len(self.trainset[u]) * len(self.trainset[v]))
+                simfactor_count += 1
+                # progress output
+                if simfactor_count % PRINT_STEP == 0:
+                    print('calculating user similarity factor(%d)' % simfactor_count, file=sys.stderr)
+
+        print('calculate user similarity matrix(similarity factor) success', file=sys.stderr)
+        print('Total similarity factor number = %d' % simfactor_count, file=sys.stderr)
+
+    # @profile
+    def recommend(self, user):
+        """recommend (take the top-K similar users' watched movies, accumulate similarity-weighted scores, and return the top-N movies)
+
+        Args:
+            user   the user
+        Returns:
+            rec_movie   list of recommended movies, sorted by score in descending order
+        """
+        ''' Find K similar users and recommend N movies. '''
+        K = self.n_sim_user
+        N = self.n_rec_movie
+        rank = dict()
+        watched_movies = self.trainset[user]
+
+        # walk the top-K most similar users
+        # v=similar user, wuv=co-occurrence-based similarity, sorted descending and truncated to K
+        # profiling note: roughly half the time is spent in the loop below
+        for v, wuv in sorted(
+                self.user_sim_mat[user].items(), key=itemgetter(1),
+                reverse=True)[0:K]:
+            for movie, rating in self.trainset[v].items():
+                if movie in watched_movies:
+                    continue
+                # predict the user's "interest" for each movie
+                rank.setdefault(movie, 0)
+                rank[movie] += wuv * rating
+        # return the N best movies
+        """
+        wuv
+        precision=0.3766 	 recall=0.0759 	 coverage=0.3183 	 popularity=6.9194
+
+        wuv * rating
+        precision=0.3865 	 recall=0.0779 	 coverage=0.2681 	 popularity=7.0116
+        """
+        return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]
+
+    def evaluate(self):
+        ''' return precision, recall, coverage and popularity '''
+        print('Evaluation start...', file=sys.stderr)
+
+        # number of recommendations returned per user
+        N = self.n_rec_movie
+        # variables for precision and recall
+        # hit counts matches between test set and recommendations, rec_count is the number of recommendations per user, test_count is the number of test-set movies per user
+        hit = 0
+        rec_count = 0
+        test_count = 0
+        # variables for coverage
+        all_rec_movies = set()
+        # variables for popularity
+        popular_sum = 0
+
+        # enumerate yields (index, value) pairs
+        # reference: http://blog.csdn.net/churximi/article/details/51648388
+        for i, user in enumerate(self.trainset):
+            if i > 0 and i % 500 == 0:
+                print('recommended for %d users' % i, file=sys.stderr)
+            test_movies = self.testset.get(user, {})
+            rec_movies = self.recommend(user)
+
+            # compare recommendations against the test set; movie, w
+            for movie, _ in rec_movies:
+                if movie in test_movies:
+                    hit += 1
+                all_rec_movies.add(movie)
+                # accumulate the log popularity of recommended movies
+                popular_sum += math.log(1 + self.movie_popular[movie])
+            rec_count += N
+            test_count += len(test_movies)
+
+        precision = hit / (1.0 * rec_count)
+        recall = hit / (1.0 * test_count)
+        coverage = len(all_rec_movies) / (1.0 * self.movie_count)
+        popularity = popular_sum / (1.0 * rec_count)
+
+        print('precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (
+            precision, recall, coverage, popularity), file=sys.stderr)
+
+
+if __name__ == '__main__':
+    # ratingfile = 'db/16.RecommenderSystems/ml-1m/ratings.dat'
+    ratingfile = 'db/16.RecommenderSystems/ml-100k/u.data'
+
+    # create the UserCF object
+    usercf = UserBasedCF()
+    # split the data 7:3 into a train set and a test set, stored in usercf.trainset and usercf.testset
+    usercf.generate_dataset(ratingfile, pivot=0.7)
+    # compute user-user similarities
+    usercf.calc_user_sim()
+    # evaluate the recommendations
+    usercf.evaluate()
diff --git a/standalone/col_filtering/item_cf.py b/py3.x/item_cf.py
similarity index 100%
rename from standalone/col_filtering/item_cf.py
rename to py3.x/item_cf.py
diff --git a/py3.x/python/Recommender.py b/py3.x/python/Recommender.py
new file mode 100644
index 0000000..40acbb0
--- /dev/null
+++ b/py3.x/python/Recommender.py
@@ -0,0 +1,28 @@
+import numpy as np
+
+
+# Jaccard similarity coefficient; only valid for 0-1 matrices
+def Jaccard(a, b):
+    return 1.0 * (a * b).sum() / (a + b - a * b).sum()
+
+
+class Recommender():
+
+    # similarity matrix
+    sim = None
+
+    # compute the similarity matrix over the rows of x
+    def similarity(self, x, distance):
+        y = np.ones((len(x), len(x)))
+        for i in range(len(x)):
+            for j in range(len(x)):
+                y[i, j] = distance(x[i], x[j])
+        return y
+
+    # fit: build the similarity matrix
+    def fit(self, x, distance=Jaccard):
+        self.sim = self.similarity(x, distance)
+
+    # recommend: score unseen items for the 0-1 consumption vector a
+    def recommend(self, a):
+        return np.dot(self.sim, a) * (1 - a)
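
A minimal usage sketch for the Recommender class above (the matrix is hypothetical; rows are items, columns are users, entries are 0/1 "consumed" flags):

    import numpy as np

    D = np.array([[1, 1, 0, 1, 0],
                  [0, 1, 1, 0, 1],
                  [1, 0, 0, 1, 1]])
    r = Recommender()
    r.fit(D)                     # item-item Jaccard similarities
    print(r.recommend(D[:, 0]))  # scores for user 0, nonzero only on unseen items
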
diff --git a/standalone/col_filtering/similarity_by_sklearn.py b/py3.x/similarity_by_sklearn.py
similarity index 100%
rename from standalone/col_filtering/similarity_by_sklearn.py
rename to py3.x/similarity_by_sklearn.py
diff --git a/py3.x/sklearn-RS-demo-cf-item-test.py b/py3.x/sklearn-RS-demo-cf-item-test.py
new file mode 100644
index 0000000..e40d2ad
--- /dev/null
+++ b/py3.x/sklearn-RS-demo-cf-item-test.py
@@ -0,0 +1,200 @@
+#!/usr/bin/python
+# coding:utf8
+'''
+Created on 2015-06-22
+Update on 2017-05-16
+Author: Lockvictor/片刻
+Collaborative filtering source code for the book Recommender System Practice (《推荐系统实践》)
+Reference: https://github.com/Lockvictor/MovieLens-RecSys
+Updates: https://github.com/apachecn/AiLearning
+'''
+import math
+import random
+import sys
+from operator import itemgetter
+
+import numpy as np
+import pandas as pd
+# sklearn.cross_validation was removed in sklearn 0.20; use model_selection instead
+from sklearn.model_selection import train_test_split
+from sklearn.metrics.pairwise import pairwise_distances
+
+# make the random data reproducible
+random.seed(0)
+
+
+class ItemBasedCF():
+    ''' TopN recommendation - ItemBasedCF '''
+
+    def __init__(self):
+        # split datasets
+        self.train_mat = {}
+        self.test_mat = {}
+
+        # total numbers of users and items
+        self.n_users = 0
+        self.n_items = 0
+
+        # n_sim_item: top 20 similar items, n_rec_item: top 10 recommendations
+        self.n_sim_item = 20
+        self.n_rec_item = 10
+
+        # item_mat_similarity: item-item similarities, item_popular: per-item occurrence counts, item_count: total number of items
+        self.item_mat_similarity = {}
+        self.item_popular = {}
+        self.item_count = 0
+
+        print('Similar item number = %d' % self.n_sim_item, file=sys.stderr)
+        print('Recommended item number = %d' % self.n_rec_item, file=sys.stderr)
+
+    def splitData(self, dataFile, test_size):
+        # load the dataset
+        header = ['user_id', 'item_id', 'rating', 'timestamp']
+        df = pd.read_csv(dataFile, sep='\t', names=header)
+
+        self.n_users = df.user_id.unique().shape[0]
+        self.n_items = df.item_id.unique().shape[0]
+
+        print('Number of users = ' + str(self.n_users) +
+              ' | Number of items = ' + str(self.n_items))
+
+        # split the dataset: users + items
+        self.train_data, self.test_data = train_test_split(
+            df, test_size=test_size)
+        print('split into train and test sets successfully', file=sys.stderr)
+        print('len(train) = %s' % np.shape(self.train_data)[0], file=sys.stderr)
+        print('len(test) = %s' % np.shape(self.test_data)[0], file=sys.stderr)
+
+    def calc_similarity(self):
+        # build user-item matrices for the train and test data
+        self.train_mat = np.zeros((self.n_users, self.n_items))
+        for line in self.train_data.itertuples():
+            self.train_mat[int(line.user_id) - 1,
+                           int(line.item_id) - 1] = float(line.rating)
+        self.test_mat = np.zeros((self.n_users, self.n_items))
+        for line in self.test_data.itertuples():
+            # print("line", line.user_id - 1, line.item_id - 1, line.rating)
+            self.test_mat[int(line.user_id) - 1,
+                          int(line.item_id) - 1] = float(line.rating)
+
+        # use sklearn's pairwise_distances to compute cosine similarities;
+        # pairwise_distances returns distances, so convert: similarity = 1 - distance
+        print("1:", np.shape(np.mat(self.train_mat).T))  # rows: items, cols: users
+        # item-item similarity (1682, 1682)
+        self.item_mat_similarity = 1 - pairwise_distances(
+            np.mat(self.train_mat).T, metric='cosine')
+        print('item_mat_similarity=', np.shape(self.item_mat_similarity), file=sys.stderr)
+
+        print('start counting popular items...', file=sys.stderr)
+
+        # count how often each item occurs across all users
+        for i_index in range(self.n_items):
+            if np.sum(self.train_mat[:, i_index]) != 0:
+                self.item_popular[i_index] = np.sum(
+                    self.train_mat[:, i_index] != 0)
+                # print("pop=", i_index, self.item_popular[i_index])
+
+        # save the total number of items
+        self.item_count = len(self.item_popular)
+        print('total number of popular items = %d' % self.item_count, file=sys.stderr)
+
+    # @profile
+    def recommend(self, u_index):
+        """recommend (take the top-K similar items, accumulate similarity-weighted scores, and return the top-N items)
+
+        Args:
+            u_index   user index = user ID - 1
+        Returns:
+            rec_item  list of recommended items, sorted by score in descending order
+        """
+        ''' Find K similar items and recommend N items. '''
+        K = self.n_sim_item
+        N = self.n_rec_item
+        rank = {}
+        i_items = np.where(self.train_mat[u_index, :] != 0)[0]
+        # print("i_items=", i_items)
+        watched_items = dict(zip(i_items, self.train_mat[u_index, i_items]))
+
+        # accumulate scores from the top-K most similar items
+        # rating = the user's rating, w = similarity between items
+        # profiling note: most of the time is spent in the loop below
+        for i_item, rating in watched_items.items():
+            i_other_items = np.where(
+                self.item_mat_similarity[i_item, :] != 0)[0]
+            for related_item, w in sorted(
+                    dict(
+                        zip(i_other_items, self.item_mat_similarity[
+                            i_item, i_other_items])).items(),
+                    key=itemgetter(1),
+                    reverse=True)[0:K]:
+                if related_item in watched_items:
+                    continue
+                rank.setdefault(related_item, 0)
+                rank[related_item] += w * rating
+
+        # return the N best items
+        return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]
+
+    def evaluate(self):
+        ''' return precision, recall, coverage and popularity '''
+        print('Evaluation start...', file=sys.stderr)
+
+        # variables for precision and recall
+        # hit counts matches between test set and recommendations, rec_count is the number of recommendations per user, test_count is the number of test-set items per user
+        hit = 0
+        rec_count = 0
+        test_count = 0
+        # variables for coverage
+        all_rec_items = set()
+        # variables for popularity
+        popular_sum = 0
+
+        # only the first 50 users are evaluated here
+        for u_index in range(50):
+            if u_index > 0 and u_index % 10 == 0:
+                print('recommended for %d users' % u_index, file=sys.stderr)
+            print("u_index", u_index)
+
+            # compare recommendations against the test set
+            rec_items = self.recommend(u_index)
+            print("rec_items=", rec_items)
+            # item, w
+            for item, _ in rec_items:
+                # print('test_mat[u_index, item]=', item, self.test_mat[u_index, item])
+                if self.test_mat[u_index, item] != 0:
+                    hit += 1
+                    print("self.test_mat[%d, %d]=%s" %
+                          (u_index, item, self.test_mat[u_index, item]))
+                # track recommended items for coverage
+                all_rec_items.add(item)
+                # accumulate the log popularity of recommended items
+                if item in self.item_popular:
+                    popular_sum += math.log(1 + self.item_popular[item])
+
+            rec_count += len(rec_items)
+            test_count += np.sum(self.test_mat[u_index, :] != 0)
+            # print("test_count=", np.sum(self.test_mat[u_index, :] != 0), np.sum(self.train_mat[u_index, :] != 0))
+
+        print("-------", hit, rec_count)
+        precision = hit / (1.0 * rec_count)
+        recall = hit / (1.0 * test_count)
+        coverage = len(all_rec_items) / (1.0 * self.item_count)
+        popularity = popular_sum / (1.0 * rec_count)
+
+        print('precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (
+            precision, recall, coverage, popularity), file=sys.stderr)
+
+
+if __name__ == '__main__':
+    dataFile = 'db/16.RecommenderSystems/ml-100k/u.data'
+
+    # create the ItemCF object
+    itemcf = ItemBasedCF()
+    # split the data 7:3 into a train set and a test set, stored in itemcf.train_mat and itemcf.test_mat
+    itemcf.splitData(dataFile, test_size=0.3)
+    # compute item-item similarities
+    itemcf.calc_similarity()
+    # evaluate the recommendations
+    # itemcf.evaluate()
+    # inspect the recommendations for one user
+    print("recommendations:", itemcf.recommend(u_index=1))
+    print("---", np.where(itemcf.test_mat[1, :] != 0)[0])
diff --git a/py3.x/sklearn-RS-demo-item.py b/py3.x/sklearn-RS-demo-item.py
new file mode 100644
index 0000000..8564827
--- /dev/null
+++ b/py3.x/sklearn-RS-demo-item.py
@@ -0,0 +1,31 @@
+#!/usr/bin/python
+# coding:utf8
+
+import numpy as np
+from sklearn.decomposition import NMF
+import matplotlib.pyplot as plt
+
+RATE_MATRIX = np.array([[5, 5, 3, 0, 5, 5], [5, 0, 4, 0, 4, 4],
+                        [0, 3, 0, 5, 4, 5], [5, 4, 3, 3, 5, 5]])
+
+nmf = NMF(n_components=2)
+user_distribution = nmf.fit_transform(RATE_MATRIX)
+item_distribution = nmf.components_
+
+item_distribution = item_distribution.T
+plt.plot(item_distribution[:, 0], item_distribution[:, 1], "b*")
+plt.xlim((-1, 3))
+plt.ylim((-1, 3))
+
+plt.title(u'the distribution of items (NMF)')
+count = 1
+for item in item_distribution:
+    plt.text(
+        item[0],
+        item[1],
+        'item ' + str(count),
+        bbox=dict(facecolor='red', alpha=0.2),
+    )
+    count += 1
+
+plt.show()
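
A quick sanity check (hedged) on the NMF factors above: multiplying the two factors back together should approximately reconstruct RATE_MATRIX (this assumes the nmf, user_distribution and RATE_MATRIX variables from the file above):

    approx = user_distribution.dot(nmf.components_)
    print(np.round(approx, 1))
    print('reconstruction error:', round(np.linalg.norm(RATE_MATRIX - approx), 3))
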
diff --git a/py3.x/sklearn-RS-demo-user.py b/py3.x/sklearn-RS-demo-user.py
new file mode 100644
index 0000000..373d091
--- /dev/null
+++ b/py3.x/sklearn-RS-demo-user.py
@@ -0,0 +1,32 @@
+#!/usr/bin/python
+# coding:utf8
+
+import numpy as np
+from sklearn.decomposition import NMF
+import matplotlib.pyplot as plt
+
+RATE_MATRIX = np.array([[5, 5, 3, 0, 5, 5], [5, 0, 4, 0, 4, 4],
+                        [0, 3, 0, 5, 4, 5], [5, 4, 3, 3, 5, 5]])
+
+nmf = NMF(n_components=2)
+user_distribution = nmf.fit_transform(RATE_MATRIX)
+item_distribution = nmf.components_
+
+users = ['Ben', 'Tom', 'John', 'Fred']
+zip_data = zip(users, user_distribution)
+
+plt.title(u'the distribution of users (NMF)')
+plt.xlim((-1, 3))
+plt.ylim((-1, 4))
+for item in zip_data:
+    user_name = item[0]
+    data = item[1]
+    plt.plot(data[0], data[1], "b*")
+    plt.text(
+        data[0],
+        data[1],
+        user_name,
+        bbox=dict(facecolor='red', alpha=0.2),
+    )
+
+plt.show()
diff --git a/py3.x/sklearn-RS-demo.py b/py3.x/sklearn-RS-demo.py
new file mode 100644
index 0000000..e421242
--- /dev/null
+++ b/py3.x/sklearn-RS-demo.py
@@ -0,0 +1,18 @@
+#!/usr/bin/python
+# coding:utf8
+
+import numpy as np
+from sklearn.decomposition import NMF
+import matplotlib.pyplot as plt
+
+RATE_MATRIX = np.array([[5, 5, 3, 0, 5, 5], [5, 0, 4, 0, 4, 4],
+                        [0, 3, 0, 5, 4, 5], [5, 4, 3, 3, 5, 5]])
+
+nmf = NMF(n_components=2)  # assume 2 latent topics
+user_distribution = nmf.fit_transform(RATE_MATRIX)
+item_distribution = nmf.components_
+
+print('topic distribution of users:')
+print(user_distribution)
+print('topic distribution of items:')
+print(item_distribution)
diff --git a/py3.x/test_evaluation_model.py b/py3.x/test_evaluation_model.py
new file mode 100644
index 0000000..164a41b
--- /dev/null
+++ b/py3.x/test_evaluation_model.py
@@ -0,0 +1,73 @@
+import math
+import random
+
+# Note: GetRecommendation is assumed to be supplied by the recommender under
+# evaluation; it should return the top-N list of (item, score) pairs for a user.
+
+
+def SplitData(data, M, k, seed):
+    test = []
+    train = []
+    random.seed(seed)
+    for user, item in data:
+        # randint is inclusive on both ends, so use M - 1 for exactly M folds
+        if random.randint(0, M - 1) == k:
+            test.append([user, item])
+        else:
+            train.append([user, item])
+    return train, test
+
+
+# precision
+def Precision(train, test, N):
+    hit = 0
+    total = 0
+    for user in train.keys():
+        tu = test[user]
+        rank = GetRecommendation(user, N)
+        for item, pui in rank:
+            if item in tu:
+                hit += 1
+        total += N
+    return hit / (total * 1.0)
+
+
+# recall
+def Recall(train, test, N):
+    hit = 0
+    total = 0
+    for user in train.keys():
+        tu = test[user]
+        rank = GetRecommendation(user, N)
+        for item, pui in rank:
+            if item in tu:
+                hit += 1
+        total += len(tu)
+    return hit / (total * 1.0)
+
+
+# coverage
+def Coverage(train, test, N):
+    recommend_items = set()
+    all_items = set()
+    for user in train.keys():
+        for item in train[user].keys():
+            all_items.add(item)
+        rank = GetRecommendation(user, N)
+        for item, pui in rank:
+            recommend_items.add(item)
+    return len(recommend_items) / (len(all_items) * 1.0)
+
+
+# novelty (average popularity of recommendations; lower means more novel)
+def Popularity(train, test, N):
+    item_popularity = dict()
+    for user, items in train.items():
+        for item in items.keys():
+            if item not in item_popularity:
+                item_popularity[item] = 0
+            item_popularity[item] += 1
+    ret = 0
+    n = 0
+    for user in train.keys():
+        rank = GetRecommendation(user, N)
+        for item, pui in rank:
+            ret += math.log(1 + item_popularity[item])
+            n += 1
+    ret /= n * 1.0
+    return ret
diff --git a/py3.x/test_graph-based.py b/py3.x/test_graph-based.py
new file mode 100644
index 0000000..72f6282
--- /dev/null
+++ b/py3.x/test_graph-based.py
@@ -0,0 +1,16 @@
+def PersonalRank(G, alpha, root):
+    """Personalized PageRank over a user-item bipartite graph G of adjacency dicts."""
+    rank = {x: 0 for x in G.keys()}
+    rank[root] = 1
+    for _ in range(20):
+        tmp = {x: 0 for x in G.keys()}
+        for i, ri in G.items():
+            # j, wij
+            for j, _ in ri.items():
+                if j not in tmp:
+                    tmp[j] = 0
+                # continue the walk with probability alpha, spread evenly over i's neighbors
+                tmp[j] += alpha * rank[i] / (1.0 * len(ri))
+        # restart at the root with probability 1 - alpha
+        tmp[root] += 1 - alpha
+        rank = tmp
+    return rank
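
A minimal sketch of running PersonalRank on a toy bipartite graph (uppercase keys are users, lowercase keys are items; the graph is hypothetical):

    G = {'A': {'a': 1, 'c': 1},
         'B': {'a': 1, 'b': 1, 'c': 1, 'd': 1},
         'C': {'c': 1, 'd': 1},
         'a': {'A': 1, 'B': 1},
         'b': {'B': 1},
         'c': {'A': 1, 'B': 1, 'C': 1},
         'd': {'B': 1, 'C': 1}}
    print(PersonalRank(G, alpha=0.8, root='A'))
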
diff --git a/py3.x/test_lfm.py b/py3.x/test_lfm.py
new file mode 100644
index 0000000..7bbe35b
--- /dev/null
+++ b/py3.x/test_lfm.py
@@ -0,0 +1,43 @@
+import random
+
+# Note: items_pool (the global list of all items, with popular items appearing
+# more often), InitModel and Predict are assumed to be provided elsewhere;
+# this file sketches the core steps of an LFM.
+
+
+# negative sampling
+def RandomSelectNegativeSample(items):
+    ret = dict()
+    for i in items.keys():
+        ret[i] = 1
+
+    n = 0
+    for i in range(0, len(items) * 3):
+        item = items_pool[random.randint(0, len(items_pool) - 1)]
+        if item in ret:
+            continue
+        ret[item] = 0
+        n += 1
+        if n > len(items):
+            break
+    return ret
+
+
+def LatentFactorModel(user_items, F, N, alpha, _lambda):
+    # P: user -> factor weights, Q: item -> factor weights
+    [P, Q] = InitModel(user_items, F)
+    for step in range(0, N):
+        for user, items in user_items.items():
+            samples = RandomSelectNegativeSample(items)
+            for item, rui in samples.items():
+                # SGD on the squared error, with L2 regularization
+                eui = rui - Predict(user, item)
+                for f in range(0, F):
+                    P[user][f] += alpha * (eui * Q[item][f] - _lambda * P[user][f])
+                    Q[item][f] += alpha * (eui * P[user][f] - _lambda * Q[item][f])
+        # decay the learning rate after each pass
+        alpha *= 0.9
+    return [P, Q]
+
+
+def Recommend(user, P, Q):
+    # here Q is assumed keyed by factor (Q[f][i]), i.e. transposed relative to the training loop
+    rank = dict()
+    for f, puf in P[user].items():
+        for i, qfi in Q[f].items():
+            rank.setdefault(i, 0)
+            rank[i] += puf * qfi
+    return rank
diff --git "a/py3.x/test_\345\237\272\344\272\216\347\211\251\345\223\201.py" "b/py3.x/test_\345\237\272\344\272\216\347\211\251\345\223\201.py"
new file mode 100644
index 0000000..d7c93b8
--- /dev/null
+++ "b/py3.x/test_\345\237\272\344\272\216\347\211\251\345\223\201.py"
@@ -0,0 +1,65 @@
+import math
+from operator import itemgetter
+
+
+def ItemSimilarity1(train):
+    # count co-rated users between item pairs
+    C = dict()
+    N = dict()
+    for u, items in train.items():
+        for i in items:
+            N.setdefault(i, 0)
+            N[i] += 1
+            for j in items:
+                if i == j:
+                    continue
+                C.setdefault(i, {})
+                C[i].setdefault(j, 0)
+                C[i][j] += 1
+
+    # calculate the final similarity matrix W
+    W = dict()
+    for i, related_items in C.items():
+        W.setdefault(i, {})
+        for j, cij in related_items.items():
+            W[i][j] = cij / math.sqrt(N[i] * N[j])
+    return W
+
+
+def ItemSimilarity2(train):
+    # count co-rated users between item pairs, down-weighting active users (IUF)
+    C = dict()
+    N = dict()
+    for u, items in train.items():
+        for i in items:
+            N.setdefault(i, 0)
+            N[i] += 1
+            for j in items:
+                if i == j:
+                    continue
+                C.setdefault(i, {})
+                C[i].setdefault(j, 0)
+                C[i][j] += 1 / math.log(1 + len(items) * 1.0)
+
+    # calculate the final similarity matrix W
+    W = dict()
+    for i, related_items in C.items():
+        W.setdefault(i, {})
+        for j, cij in related_items.items():
+            W[i][j] = cij / math.sqrt(N[i] * N[j])
+    return W
+
+
+def Recommendation1(train, user_id, W, K):
+    rank = dict()
+    ru = train[user_id]
+    for i, pi in ru.items():
+        for j, wj in sorted(W[i].items(), key=itemgetter(1), reverse=True)[0:K]:
+            if j in ru:
+                continue
+            rank.setdefault(j, 0)
+            rank[j] += pi * wj
+    return rank
+
+
+def Recommendation2(train, user_id, W, K):
+    # same as Recommendation1, but also records each source item's contribution as the "reason"
+    rank = dict()
+    ru = train[user_id]
+    for i, pi in ru.items():
+        for j, wj in sorted(W[i].items(), key=itemgetter(1), reverse=True)[0:K]:
+            if j in ru:
+                continue
+            rank.setdefault(j, {'weight': 0, 'reason': {}})
+            rank[j]['weight'] += pi * wj
+            rank[j]['reason'][i] = pi * wj
+    return rank
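
A toy run of the item-based functions above (hedged; the tiny train dict is hypothetical, mapping users to item:rating dicts):

    train = {'A': {'a': 1, 'b': 1, 'd': 1},
             'B': {'b': 1, 'c': 1, 'e': 1},
             'C': {'c': 1, 'd': 1},
             'D': {'b': 1, 'c': 1, 'd': 1}}
    W = ItemSimilarity1(train)
    print(Recommendation1(train, 'A', W, K=3))  # scores the items A has not rated, e.g. 'c' and 'e'
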
diff --git "a/py3.x/test_\345\237\272\344\272\216\347\224\250\346\210\267.py" "b/py3.x/test_\345\237\272\344\272\216\347\224\250\346\210\267.py"
new file mode 100644
index 0000000..b3341a6
--- /dev/null
+++ "b/py3.x/test_\345\237\272\344\272\216\347\224\250\346\210\267.py"
@@ -0,0 +1,80 @@
+import math
+from operator import itemgetter
+
+
+def UserSimilarity1(train):
+    # naive O(|U|^2) version; assumes train[u] is the set of items u has interacted with
+    W = dict()
+    for u in train.keys():
+        W.setdefault(u, {})
+        for v in train.keys():
+            if u == v:
+                continue
+            W[u][v] = len(train[u] & train[v])
+            W[u][v] /= math.sqrt(len(train[u]) * len(train[v]) * 1.0)
+    return W
+
+
+def UserSimilarity2(train):
+    # build inverse table for item_users
+    item_users = dict()
+    for u, items in train.items():
+        for i in items.keys():
+            if i not in item_users:
+                item_users[i] = set()
+            item_users[i].add(u)
+
+    # count co-rated items between users
+    C = dict()
+    N = dict()
+    for i, users in item_users.items():
+        for u in users:
+            N.setdefault(u, 0)
+            N[u] += 1
+            for v in users:
+                if u == v:
+                    continue
+                C.setdefault(u, {})
+                C[u].setdefault(v, 0)
+                C[u][v] += 1
+
+    # calculate the final similarity matrix W
+    W = dict()
+    for u, related_users in C.items():
+        W.setdefault(u, {})
+        for v, cuv in related_users.items():
+            W[u][v] = cuv / math.sqrt(N[u] * N[v])
+    return W
+
+
+def UserSimilarity3(train):
+    # build inverse table for item_users
+    item_users = dict()
+    for u, items in train.items():
+        for i in items.keys():
+            if i not in item_users:
+                item_users[i] = set()
+            item_users[i].add(u)
+
+    # count co-rated items between users, down-weighting popular items (User-IIF)
+    C = dict()
+    N = dict()
+    for i, users in item_users.items():
+        for u in users:
+            N.setdefault(u, 0)
+            N[u] += 1
+            for v in users:
+                if u == v:
+                    continue
+                C.setdefault(u, {})
+                C[u].setdefault(v, 0)
+                C[u][v] += 1 / math.log(1 + len(users))
+
+    # calculate the final similarity matrix W
+    W = dict()
+    for u, related_users in C.items():
+        W.setdefault(u, {})
+        for v, cuv in related_users.items():
+            W[u][v] = cuv / math.sqrt(N[u] * N[v])
+    return W
+
+
+def Recommend(user, train, W, K):
+    rank = dict()
+    interacted_items = train[user]
+    for v, wuv in sorted(W[user].items(), key=itemgetter(1), reverse=True)[0:K]:
+        for i, rvi in train[v].items():
+            if i in interacted_items:
+                # filter out items the user interacted with before
+                continue
+            rank.setdefault(i, 0)
+            rank[i] += wuv * rvi
+    return rank
diff --git a/standalone/col_filtering/user_cf.py b/py3.x/user_cf.py
similarity index 100%
rename from standalone/col_filtering/user_cf.py
rename to py3.x/user_cf.py
diff --git a/standalone/col_filtering/__init__.py b/standalone/col_filtering/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/standalone/col_filtering/mycache/joblib/__main__-C%3A-dtworkspace-RecommenderSystems-standalone-col_filtering-similarity_by_sklearn/get_data/5df639254df90ffb4b58eba85a36303c/metadata.json b/standalone/col_filtering/mycache/joblib/__main__-C%3A-dtworkspace-RecommenderSystems-standalone-col_filtering-similarity_by_sklearn/get_data/5df639254df90ffb4b58eba85a36303c/metadata.json
deleted file mode 100644
index 7a2b92f..0000000
--- a/standalone/col_filtering/mycache/joblib/__main__-C%3A-dtworkspace-RecommenderSystems-standalone-col_filtering-similarity_by_sklearn/get_data/5df639254df90ffb4b58eba85a36303c/metadata.json
+++ /dev/null
@@ -1 +0,0 @@
-{"duration": 0.02300095558166504, "input_args": {"filename": "'C:/dtworkspace/RecommenderSystems/data/ratingslibsvm'"}}
\ No newline at end of file
diff --git a/standalone/col_filtering/mycache/joblib/__main__-C%3A-dtworkspace-RecommenderSystems-standalone-col_filtering-similarity_by_sklearn/get_data/5df639254df90ffb4b58eba85a36303c/output.pkl b/standalone/col_filtering/mycache/joblib/__main__-C%3A-dtworkspace-RecommenderSystems-standalone-col_filtering-similarity_by_sklearn/get_data/5df639254df90ffb4b58eba85a36303c/output.pkl
deleted file mode 100644
index cdac7ea1e07b5d576c02dd65baa0fe5535e5a43b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1960
zcmai#OK1~O6ow~F)5d8{Vq2|`YWmPNx`QYK|zBJs#2a
z>Af%%%K3t6xxB}8FBUB(%4w_SS)$B}Zu&wcS1jbxvGlf6V-B_$#9kT=@B$+P1Uc7CSUIxDk5(Q9@9f`9_4Z0
zmiOuyJ<+b2`b<+cm|$>SKu{}b-sfx40Zy{
zf|FnoEP)l!7gm%exDVa|c?WSY3`Rf_jDZO-2~r>pron!27d!+{!3(eq-h&nJ1;}~h
zAPJ_x9w2AWfCX?C$TQ?Y4LksEz$*9weu7^>o?#M9gJVFR!vM?R1NaD%n12BpRlCpb*Ls?Kn;o;`cETRA<Mdb*e-DPIbsmaa4yS*{L7Jlb`GqN9Pa5Q9Q+w4B9D=>XM{5l9W$=
z($|s{AFLD1BR|E{xkwNCgZUIsacfD+Un`F4(i|i^<dYcfrT+jPzXJ#W
diff --git a/standalone/col_filtering/mycache/joblib/__main__-C%3A-dtworkspace-RecommenderSystems-standalone-col_filtering-similarity_by_sklearn/get_data/9c0e4967ea839d4f938fcd3dc25572d0/metadata.json b/standalone/col_filtering/mycache/joblib/__main__-C%3A-dtworkspace-RecommenderSystems-standalone-col_filtering-similarity_by_sklearn/get_data/9c0e4967ea839d4f938fcd3dc25572d0/metadata.json
deleted file mode 100644
index 378412a..0000000
--- 
a/standalone/col_filtering/mycache/joblib/__main__-C%3A-dtworkspace-RecommenderSystems-standalone-col_filtering-similarity_by_sklearn/get_data/9c0e4967ea839d4f938fcd3dc25572d0/metadata.json +++ /dev/null @@ -1 +0,0 @@ -{"duration": 0.03000164031982422, "input_args": {"filename": "'../../data/ratingslibsvm'"}} \ No newline at end of file diff --git a/standalone/col_filtering/mycache/joblib/__main__-C%3A-dtworkspace-RecommenderSystems-standalone-col_filtering-similarity_by_sklearn/get_data/9c0e4967ea839d4f938fcd3dc25572d0/output.pkl b/standalone/col_filtering/mycache/joblib/__main__-C%3A-dtworkspace-RecommenderSystems-standalone-col_filtering-similarity_by_sklearn/get_data/9c0e4967ea839d4f938fcd3dc25572d0/output.pkl deleted file mode 100644 index cdac7ea1e07b5d576c02dd65baa0fe5535e5a43b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1960 zcmai#OK1~O6ow~F)5d8{Vq2|`YWmPNx`QYK|zBJs#2a z>Af%%%K3t6xxB}8FBUB(%4w_SS)$B}Zu&wcS1jbxvGlf6V-B_$#9kT=@B$+P1Uc7CSUIxDk5(Q9@9f`9_4Z0 zmiOuyJ<+b2`b<+cm|$>SKu{}b-sfx40Zy{ zf|FnoEP)l!7gm%exDVa|c?WSY3`Rf_jDZO-2~r>pron!27d!+{!3(eq-h&nJ1;}~h zAPJ_x9w2AWfCX?C$TQ?Y4LksEz$*9weu7^>o?#M9gJVFR!vM?R1NaD%n12BpRlCpb*Ls?Kn;o;`cETRA<Mdb*e-DPIbsmaa4yS*{L7Jlb`GqN9Pa5Q9Q+w4B9D=>XM{5l9W$= z($|s{AFLD1BR|E{xkwNCgZUIsacfD+Un`F4(i|i^<dYcfrT+jPzXJ#W diff --git a/standalone/col_filtering/mycache/joblib/__main__-C%3A-dtworkspace-RecommenderSystems-standalone-col_filtering-similarity_by_sklearn/get_data/func_code.py b/standalone/col_filtering/mycache/joblib/__main__-C%3A-dtworkspace-RecommenderSystems-standalone-col_filtering-similarity_by_sklearn/get_data/func_code.py deleted file mode 100644 index 4a556e9..0000000 --- a/standalone/col_filtering/mycache/joblib/__main__-C%3A-dtworkspace-RecommenderSystems-standalone-col_filtering-similarity_by_sklearn/get_data/func_code.py +++ /dev/null @@ -1,5 +0,0 @@ -# first line: 24 -@mem.cache -def get_data(filename): - data = load_svmlight_file(filename) - return data[0], data[1] -- GitLab