Commit 55ff5ea4 authored by Yao544303

init

Parent 4967e7c8
*.class
*.log
/spark/target
/*/.idea
/spark/derby.log
/spark/*.iml
/standalone/mycache
This diff is collapsed.
{
"resources": {
"local_catch_file": "log/cache.txt",
"local_schema_file": "conf/ad_data.avsc"
},
"kafka":{
"topics": "datasys",
"consumer_group": "datasys_kafka2hive",
"brokerHosts": "zk1.common.ad.m.com:9092,zk2.common.ad.m.com:9092,zk3.common.ad.m.com:9092",
"zkHosts": "zk1.common.ad.m.com:2181,zk2.common.ad.m.com:2181,zk3.common.ad.m.com:2181",
"message_num": 400,
"root_dir": "/var/tmp"
},
"hdfs": {
"hdfs_name": "hadoopuser",
"hdfs_port": 50070,
"hdfs_host": "BJSH-ADHBASE-134-128.meitu-inc.com",
"hdfs_path": "/user/hadoopuser/jiangzl"
},
"hive": {
"hive_port": 10000,
"hive_host": "BJSH-ADHBASE-134-128.meitu-inc.com"
}
}
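This config appears to wire the datasys kafka2hive job to its Kafka, HDFS, and Hive endpoints. A minimal sketch of reading it with Python's standard json module; the conf/config.json path is an assumption, since the commit does not name the file:

import json

with open("conf/config.json") as f:  # hypothetical path
    conf = json.load(f)

brokers = conf["kafka"]["brokerHosts"]  # key names follow the file above
topic = conf["kafka"]["topics"]
hdfs_url = "http://%s:%d" % (conf["hdfs"]["hdfs_host"], conf["hdfs"]["hdfs_port"])
print(topic, brokers, hdfs_url)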
1::661::3::978302109
1::914::3::978301968
1::594::4::978302268
1::919::4::978301368
1::595::5::978824268
1::938::4::978301752
1::720::3::978300760
1::527::5::978824195
1::48::5::978824351
1::745::3::978824268
1::588::4::978824268
1::783::4::978824291
1::150::5::978301777
1::1::5::978824268
1::260::4::978300760
1::531::4::978302149
1::608::4::978301398
2::647::3::978299351
2::648::4::978299913
2::434::2::978300174
2::292::3::978300123
2::902::2::978298905
2::368::4::978300002
2::110::5::978298625
2::589::4::978299773
2::982::4::978299269
2::515::5::978298542
2::442::3::978300025
2::265::4::978299026
2::480::5::978299809
2::590::5::978299083
2::736::4::978300100
2::593::5::978298517
2::95::2::978300143
2::235::3::978299351
2::163::4::978299809
2::21::1::978299839
2::165::3::978300002
2::380::5::978299809
2::349::4::978299839
2::457::4::978299773
2::920::5::978298775
2::459::3::978300002
2::780::3::978299966
2::498::3::978299418
2::318::5::978298413
2::356::5::978299686
3::648::3::978297867
3::104::4::978298486
3::653::4::978297757
3::260::5::978297512
3::552::4::978297837
3::480::4::978297690
3::733::5::978297757
3::590::4::978297439
3::593::3::978297018
4::260::5::978294199
4::480::4::978294008
5::39::3::978245037
5::288::2::978246585
5::860::2::978244493
5::866::4::978245334
5::215::3::978245422
5::501::1::978244001
5::506::4::978245999
5::509::4::978245829
5::41::4::978244692
5::47::3::978245334
5::296::4::978244177
5::581::3::978244808
5::728::4::978244759
5::299::3::978242934
5::150::2::978245763
5::224::3::978245829
5::229::3::978246528
5::6::2::978245916
5::515::4::978245891
5::800::2::978244540
5::50::5::978244205
5::52::2::978246479
5::733::1::978245763
5::377::4::978245999
5::593::4::978244177
5::162::4::978244624
5::968::3::978242847
5::896::4::978244493
5::318::3::978244177
5::176::4::978244568
5::461::3::978244893
5::608::4::978244177
5::321::3::978245863
5::908::4::978241072
5::16::3::978245645
5::265::3::978245037
5::194::3::978246108
5::551::4::978246504
5::913::5::978242740
5::919::4::978241072
5::412::2::978245891
5::994::5::978244540
5::272::3::978245487
5::24::1::978242934
5::348::4::978245863
5::29::5::978245065
5::562::4::978244603
5::497::3::978245687
5::202::2::978246033
5::353::2::978246504
5::32::4::978244962
5::34::4::978244603
5::356::1::978241112
5::357::2::978245829
5::36::3::978244808
5::714::4::978244493
4 260:5 480:4
2 21:1 95:2 110:5 163:4 165:3 235:3 265:4 292:3 318:5 349:4 356:5 368:4 380:5 434:2 442:3 457:4 459:3 480:5 498:3 515:5 589:4 590:5 593:5 647:3 648:4 736:4 780:3 902:2 920:5 982:4
5 6:2 16:3 24:1 29:5 32:4 34:4 36:3 39:3 41:4 47:3 50:5 52:2 150:2 162:4 176:4 194:3 202:2 215:3 224:3 229:3 265:3 272:3 288:2 296:4 299:3 318:3 321:3 348:4 353:2 356:1 357:2 377:4 412:2 461:3 497:3 501:1 506:4 509:4 515:4 551:4 562:4 581:3 593:4 608:4 714:4 728:4 733:1 800:2 860:2 866:4 896:4 908:4 913:5 919:4 968:3 994:5
3 104:4 260:5 480:4 552:4 590:4 593:3 648:3 653:4 733:5
1 1:5 48:5 150:5 260:4 527:5 531:4 588:4 594:4 595:5 608:4 661:3 720:3 745:3 783:4 914:3 919:4 938:4
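The rows above are the sparse form of the ratings data: each row is a user id followed by that user's sorted movieId:rating pairs, as produced by changeRatings2LibSVM in the Features object below. A small Python sketch (illustration only) that parses one such row back into a dict:

def parse_ratings_row(line):
    # "4 260:5 480:4" -> ("4", {260: 5, 480: 4})
    user, *pairs = line.strip().split(" ")
    return user, {int(m): int(r) for m, r in (p.split(":") for p in pairs)}

user, ratings = parse_ratings_row("4 260:5 480:4")
assert ratings == {260: 5, 480: 4}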
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>apache.wiki</groupId>
<artifactId>RecommenderSystems</artifactId>
<version>1.0-SNAPSHOT</version>
<name>${project.artifactId}</name>
<description>My wonderful Scala app</description>
<inceptionYear>2015</inceptionYear>
<licenses>
<license>
<name>My License</name>
<url>http://....</url>
<distribution>repo</distribution>
</license>
</licenses>
<properties>
<maven.compiler.source>1.6</maven.compiler.source>
<maven.compiler.target>1.6</maven.compiler.target>
<encoding>UTF-8</encoding>
<scala.version>2.11.7</scala.version>
<scala.compat.version>2.11</scala.compat.version>
<spark.version>2.0.0</spark.version>
</properties>
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scalap</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-compiler</artifactId>
<version>${scala.version}</version>
</dependency>
<!-- Test -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.specs2</groupId>
<artifactId>specs2-junit_${scala.compat.version}</artifactId>
<version>2.4.16</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.specs2</groupId>
<artifactId>specs2-core_${scala.compat.version}</artifactId>
<version>2.4.16</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_${scala.compat.version}</artifactId>
<version>2.2.4</version>
<scope>test</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core_2.11 -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.compat.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_${scala.compat.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.compat.version}</artifactId>
<version>${spark.version}</version>
</dependency>
</dependencies>
<build>
<sourceDirectory>src/main/scala</sourceDirectory>
<testSourceDirectory>src/test/scala</testSourceDirectory>
<plugins>
<plugin>
<!-- see http://davidb.github.com/scala-maven-plugin -->
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.0</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
<configuration>
<args>
<!--<arg>-make:transitive</arg>-->
<arg>-dependencyfile</arg>
<arg>${project.build.directory}/.scala_dependencies</arg>
</args>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.18.1</version>
<configuration>
<useFile>false</useFile>
<disableXmlReport>true</disableXmlReport>
<!-- If you have classpath issue like NoDefClassError,... -->
<!-- useManifestOnlyJar>false</useManifestOnlyJar -->
<includes>
<include>**/*Test.*</include>
<include>**/*Suite.*</include>
</includes>
</configuration>
</plugin>
</plugins>
</build>
</project>
/target/*
/.idea/*
/spark-warehouse/*
*.iml
/target/
/.settings/*
*.project
*.classpath
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>apache.wiki</groupId>
<artifactId>RecommenderSystems</artifactId>
<version>1.0-SNAPSHOT</version>
<name>${project.artifactId}</name>
<description>My wonderful Scala app</description>
<inceptionYear>2015</inceptionYear>
<licenses>
<license>
<name>My License</name>
<url>http://....</url>
<distribution>repo</distribution>
</license>
</licenses>
<properties>
<!--
<spark.version>2.1.0</spark.version>
-->
<spark.version>1.6.2</spark.version>
<scala.version>2.10</scala.version>
</properties>
<repositories>
<repository>
<id>scala-tools.org</id>
<name>Scala-Tools Maven2 Repository</name>
<url>http://scala-tools.org/repo-releases</url>
</repository>
</repositories>
<pluginRepositories>
<pluginRepository>
<id>scala-tools.org</id>
<name>Scala-Tools Maven2 Repository</name>
<url>http://scala-tools.org/repo-releases</url>
</pluginRepository>
</pluginRepositories>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.version}</artifactId>
<version>${spark.version}</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.version}</artifactId>
<version>${spark.version}</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_${scala.version}</artifactId>
<version>${spark.version}</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_${scala.version}</artifactId>
<version>${spark.version}</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
<version>1.7.7</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.4</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.specs</groupId>
<artifactId>specs</artifactId>
<version>1.2.5</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.databricks</groupId>
<artifactId>spark-csv_2.10</artifactId>
<version>1.3.0</version>
<!--<scope>provided</scope>-->
</dependency>
</dependencies>
<build>
<sourceDirectory>src/main/scala</sourceDirectory>
<testSourceDirectory>src/test/scala</testSourceDirectory>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.2</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<createDependencyReducedPom>false</createDependencyReducedPom>
<finalName>example-${project.version}</finalName>
<artifactSet>
<excludes>
<exclude>oro*</exclude>
<!--exclude>org.apache.*:*</exclude-->
<exclude>junit:junit</exclude>
<exclude>org.*</exclude>
<exclude>au.*</exclude>
<exclude>codegen.*</exclude>
<exclude>images.*</exclude>
<exclude>javaewah.*</exclude>
<exclude>javassist.*</exclude>
<exclude>javax.*</exclude>
<exclude>javolution.*</exclude>
<exclude>jodd.*</exclude>
<exclude>parquet.*</exclude>
<exclude>repackage.*</exclude>
<exclude>templates.*</exclude>
<exclude>webapps.*</exclude>
<exclude>schemaorg_apache_xmlbeans.*</exclude>
<exclude>com.google.*</exclude>
<exclude>com.facebook.*</exclude>
<exclude>com.m.*</exclude>
<exclude>org.*</exclude>
<exclude>*:xml-apis</exclude>
<exclude>log4j*</exclude>
<exclude>org.antlr*</exclude>
<exclude>org.datanucleus*</exclude>
<exclude>net*</exclude>
<exclude>commons*</exclude>
<exclude>com.j*</exclude>
<exclude>com.x*</exclude>
<exclude>com.n*</exclude>
<exclude>com.i*</exclude>
<exclude>com.t*</exclude>
<exclude>com.c*</exclude>
<exclude>com.gi*</exclude>
<exclude>com.f*</exclude>
<exclude>com.su*</exclude>
<exclude>com.a*</exclude>
<exclude>com.e*</exclude>
<exclude>javax*</exclude>
<exclude>s*</exclude>
<exclude>i*</exclude>
<exclude>j*</exclude>
<exclude>a*</exclude>
<exclude>x*</exclude>
</excludes>
</artifactSet>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}.6</scalaVersion>
<args>
<arg>-target:jvm-1.5</arg>
</args>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-eclipse-plugin</artifactId>
<configuration>
<downloadSources>true</downloadSources>
<buildcommands>
<buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
</buildcommands>
<additionalProjectnatures>
<projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
</additionalProjectnatures>
<classpathContainers>
<classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
<classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
</classpathContainers>
</configuration>
</plugin>
</plugins>
</build>
<reporting>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<configuration>
<scalaVersion>${scala.version}.6</scalaVersion>
</configuration>
</plugin>
</plugins>
</reporting>
</project>
package com.apachecn.recommand.colfliter
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by ych on 2018/9/20.
*/
object Features {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("Features Prepare").setMaster("local")
val sc = new SparkContext(conf)
val ratingsPath = "C:\\dtworkspace\\recommand\\data\\ratings"
val ratingsLibSVMPath = "C:\\dtworkspace\\recommand\\data\\ratingslibsvm"
/**
 * Convert the input ratings into a sparse representation.
 * For example, the input lines
 * 1::661::3::978302109
 * 1::914::3::978301968
 * become
 * 1 661:3 914:3
 */
def changeRatings2LibSVM(): Unit = {
  sc.textFile(ratingsPath)
    .map(_.split("::"))
    .map(x => (x(0), Array((x(1).toInt, x(2).toInt)))) // (userId, [(movieId, rating)])
    .reduceByKey(_ ++ _)                               // gather each user's ratings
    .map(x => x._1 + " " + x._2.sortBy(_._1).map(p => f"${p._1}:${p._2}").mkString(" "))
    .coalesce(1).saveAsTextFile(ratingsLibSVMPath)
}

changeRatings2LibSVM()
}
}
package com.apachecn.recommand.colfliter
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import scala.collection.BitSet
/**
 * Created by ych on 2018/9/20.
 * Item-based collaborative filtering.
 */
class ItemCF {
/**
 * Compute the Jaccard similarity of every item pair, holding each item's
 * raters in a BitSet.
 */
def computeJaccardSim(sc: SparkContext,
                      pathIn: String): RDD[(String, Double)] = {
  // Build, per item, the BitSet of row indices of the users who rated it.
  val rdd = sc.textFile(pathIn)
    .map(_.split(" ", 2)(1))                     // drop the leading user id
    .zipWithIndex()
    .map(x => (x._2.toInt, x._1.split(" ", -1))) // (userIndex, ["movie:rating", ...])
    .map(x => {
      for (i <- x._2) yield {
        (i.split(":")(0), x._1)                  // (movieId, userIndex)
      }
    }).flatMap(x => x)
    .map(x => (x._1, BitSet(x._2))).reduceByKey(_.union(_))
  // Jaccard similarity of each item pair: |intersection| / |union|.
  val re = rdd.cartesian(rdd).map {
    case ((key0, set0), (key1, set1)) => {
      val key = key0 + "|" + key1
      val j = (set0 & set1).size
      val q = set0.union(set1).size
      (key, j.toDouble / q)
    }
  }
  re
}
}
object ItemCF{
}
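For reference, the same idea in plain Python (a sketch, not part of this commit): give each item the set of row indices of the users who rated it, then score every pair by |intersection| / |union|.

from itertools import combinations

# Toy data shaped like the sparse ratings file: movie id -> set of user row indices.
item_users = {"260": {0, 2, 3}, "480": {0, 1, 2, 3}, "593": {1, 2}}

jaccard = {}
for (i, ui), (j, uj) in combinations(item_users.items(), 2):
    jaccard[i + "|" + j] = len(ui & uj) / len(ui | uj)

print(jaccard)  # {'260|480': 0.75, '260|593': 0.25, '480|593': 0.5}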
package com.apachecn.recommand.colfliter
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by ych on 2018/9/20.
*/
object sparkUtils {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("utils").setMaster("local")
    val sc = new SparkContext(conf)

    // Keep only the ratings whose movie id is below 1000 and write them back out.
    def selectData(): Unit = {
      val in = "C:\\dtworkspace\\recommand\\data\\ratings"
      sc.textFile(in)
        .map(x => (x, x.split("::")(1).toInt))
        .filter(_._2 < 1000)
        .map(_._1)
        .coalesce(1).saveAsTextFile(in + "out")
    }

    selectData()
  }
}
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 14 09:07:33 2018
@author: ych
E-mail:yao544303963@gmail.com
"""
import random
import sys
import math
from operator import itemgetter
random.seed(0)
class ItemCF(object):
def __init__(self):
self.trainset = {}
self.testset = {}
self.n_sim_movie = 20
self.n_rec_movie = 10
self.movie_sim_mat = {}
self.movie_popular = {}
self.movie_count = 0
print('Similar movie number = %d' % self.n_sim_movie, file=sys.stderr)
        print('Recommended movie number = %d' % self.n_rec_movie, file=sys.stderr)
def generate_dataset(self, filename, pivot=0.7):
trainset_len = 0
testset_len = 0
fp = open(filename, 'r')
for line in fp:
user, movie, rating, _ = line.split('::')
if random.random() < pivot:
self.trainset.setdefault(user, {})
self.trainset[user][movie] = int(rating)
trainset_len += 1
else:
self.testset.setdefault(user, {})
self.testset[user][movie] = int(rating)
testset_len += 1
        print('split succ, trainset is %d, testset is %d' % (trainset_len, testset_len), file=sys.stderr)
def calc_movie_sim(self):
for user, movies in self.trainset.items():
for movie in movies:
if movie not in self.movie_popular:
self.movie_popular[movie] = 0
self.movie_popular[movie] += 1
        print('count movie number and popularity succ', file=sys.stderr)
self.movie_count = len(self.movie_popular)
print('total movie number = %d' % self.movie_count, file=sys.stderr)
itemsim_mat = self.movie_sim_mat
print('building co-rated users matrix', file=sys.stderr)
for user, movies in self.trainset.items():
for m1 in movies:
for m2 in movies:
if m1 == m2:
continue
itemsim_mat.setdefault(m1, {})
itemsim_mat[m1].setdefault(m2, 0)
itemsim_mat[m1][m2] += 1
print('build co-rated users matrix succ', file=sys.stderr)
print('calculating movie similarity matrix', file=sys.stderr)
simfactor_count = 0
PRINT_STEP = 2000000
for m1, related_movies in itemsim_mat.items():
for m2, count in related_movies.items():
itemsim_mat[m1][m2] = count / math.sqrt(self.movie_popular[m1] * self.movie_popular[m2])
simfactor_count += 1
if simfactor_count % PRINT_STEP == 0:
                    print('calculating movie similarity factor(%d)' % simfactor_count, file=sys.stderr)
        print('calculate similarity succ', file=sys.stderr)
def recommend(self, user):
K = self.n_sim_movie
N = self.n_rec_movie
rank = {}
watched_movies = self.trainset[user]
for movie, rating in watched_movies.items():
for related_movie, similarity_factor in sorted(self.movie_sim_mat[movie].items(), key=itemgetter(1),
reverse=True)[0:K]:
if related_movie in watched_movies:
continue
rank.setdefault(related_movie, 0)
rank[related_movie] += similarity_factor * rating
return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]
def evaluate(self):
print('evaluation start', file=sys.stderr)
N = self.n_rec_movie
hit = 0
rec_count = 0
test_count = 0
all_rec_movies = set()
popular_sum = 0
for i, user in enumerate(self.trainset):
if i % 500 == 0:
print('recommend for %d users ' % i, file=sys.stderr)
test_movies = self.testset.get(user, {})
rec_movies = self.recommend(user)
for movie, _ in rec_movies:
if movie in test_movies:
hit += 1
all_rec_movies.add(movie)
popular_sum += math.log(1 + self.movie_popular[movie])
rec_count += N
test_count += len(test_movies)
precision = hit / (1.0 * rec_count)
recall = hit / (1.0 * test_count)
coverage = len(all_rec_movies) / (1.0 * self.movie_count)
popularity = popular_sum / (1.0 * rec_count)
print('precision is %.4f\t recall is %.4f \t coverage is %.4f \t popularity is %.4f'
% (precision, recall, coverage, popularity), file=sys.stderr)
if __name__ == '__main__':
ratingfile = "C://workspace//data//ml-1m//ml-1m//ratings.dat"
item_cf = ItemCF()
item_cf.generate_dataset(ratingfile)
item_cf.calc_movie_sim()
item_cf.evaluate()
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 14 11:11:25 2018
@author: ych
E-mail:yao544303963@gmail.com
"""
from sklearn.externals.joblib import Memory
from sklearn.datasets import load_svmlight_file
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np
np.set_printoptions(suppress=True)
mem = Memory("./mycache")
# Input data format:
# user movie1:rating1 movie2:rating2
@mem.cache
def get_data(filename):
data = load_svmlight_file(filename)
return data[0], data[1]
# Compute Jaccard similarity
def get_jaccard_similarity(X):
n = X.shape[1]
similarity = np.zeros([n, n])
for i in range(n):
v1 = X.T[i].toarray()
for j in range(i + 1, n):
v2 = X.T[j].toarray()
sim = jaccard_similarity_score(v1, v2)
similarity[i][j] = sim
similarity[j][i] = sim
return similarity
# Compute cosine similarity
def get_cosine_similarity(X):
similarity = cosine_similarity(X)
return similarity
filename = "C:/dtworkspace/recommand/data/ratingslibsvm"
X, y = get_data(filename)
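The two helpers above are defined but never called; a minimal usage sketch (illustration only), assuming the libsvm-format file exists at the path above:

user_sim = get_cosine_similarity(X)   # rows of X are users -> user-user similarity
item_sim = get_jaccard_similarity(X)  # walks the columns of X -> item-item similarity
print(user_sim.shape, item_sim.shape)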
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 17 10:55:09 2018
@author: ych
E-mail:yao544303963@gmail.com
"""
import sys
import random
import math
import os
from operator import itemgetter
random.seed(0)
class UserCF(object):
''' TopN recommendation - User Based Collaborative Filtering '''
def __init__(self):
self.trainset = {}
self.testset = {}
self.n_sim_user = 20
self.n_rec_movie = 10
self.user_sim_mat = {}
self.movie_popular = {}
self.movie_count = 0
print ('Similar user number = %d' % self.n_sim_user, file=sys.stderr)
print ('recommended movie number = %d' %
self.n_rec_movie, file=sys.stderr)
def generate_dataset(self, filename, pivot=0.7):
''' load rating data and split it to training set and test set '''
trainset_len = 0
testset_len = 0
fp = open(filename, 'r')
for line in fp:
user, movie, rating, _ = line.split('::')
# split the data by pivot
if random.random() < pivot:
self.trainset.setdefault(user, {})
self.trainset[user][movie] = int(rating)
trainset_len += 1
else:
self.testset.setdefault(user, {})
self.testset[user][movie] = int(rating)
testset_len += 1
print ('split training set and test set succ', file=sys.stderr)
print ('train set = %s' % trainset_len, file=sys.stderr)
print ('test set = %s' % testset_len, file=sys.stderr)
def calc_user_sim(self):
''' calculate user similarity matrix '''
# build inverse table for item-users
# key=movieID, value=list of userIDs who have seen this movie
print ('building movie-users inverse table...', file=sys.stderr)
movie2users = dict()
for user, movies in self.trainset.items():
for movie in movies:
# inverse table for item-users
if movie not in movie2users:
movie2users[movie] = set()
movie2users[movie].add(user)
# count item popularity at the same time
if movie not in self.movie_popular:
self.movie_popular[movie] = 0
self.movie_popular[movie] += 1
print ('build movie-users inverse table succ', file=sys.stderr)
# save the total movie number, which will be used in evaluation
self.movie_count = len(movie2users)
print ('total movie number = %d' % self.movie_count, file=sys.stderr)
# count co-rated items between users
usersim_mat = self.user_sim_mat
print ('building user co-rated movies matrix...', file=sys.stderr)
for movie, users in movie2users.items():
for u in users:
for v in users:
if u == v:
continue
usersim_mat.setdefault(u, {})
usersim_mat[u].setdefault(v, 0)
usersim_mat[u][v] += 1
print ('build user co-rated movies matrix succ', file=sys.stderr)
# calculate similarity matrix
print ('calculating user similarity matrix...', file=sys.stderr)
simfactor_count = 0
PRINT_STEP = 2000000
for u, related_users in usersim_mat.items():
for v, count in related_users.items():
usersim_mat[u][v] = count / math.sqrt(
len(self.trainset[u]) * len(self.trainset[v]))
simfactor_count += 1
if simfactor_count % PRINT_STEP == 0:
print ('calculating user similarity factor(%d)' %
simfactor_count, file=sys.stderr)
print ('calculate user similarity matrix(similarity factor) succ',
file=sys.stderr)
print ('Total similarity factor number = %d' %
simfactor_count, file=sys.stderr)
def recommend(self, user):
''' Find K similar users and recommend N movies. '''
K = self.n_sim_user
N = self.n_rec_movie
rank = dict()
watched_movies = self.trainset[user]
for similar_user, similarity_factor in sorted(self.user_sim_mat[user].items(),
key=itemgetter(1), reverse=True)[0:K]:
for movie in self.trainset[similar_user]:
if movie in watched_movies:
continue
# predict the user's "interest" for each movie
rank.setdefault(movie, 0)
rank[movie] += similarity_factor
# return the N best movies
return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]
def evaluate(self):
''' print evaluation result: precision, recall, coverage and popularity '''
print ('Evaluation start...', file=sys.stderr)
N = self.n_rec_movie
        # variables for precision and recall
hit = 0
rec_count = 0
test_count = 0
        # variables for coverage
all_rec_movies = set()
        # variables for popularity
popular_sum = 0
for i, user in enumerate(self.trainset):
if i % 500 == 0:
print ('recommended for %d users' % i, file=sys.stderr)
test_movies = self.testset.get(user, {})
rec_movies = self.recommend(user)
for movie, _ in rec_movies:
if movie in test_movies:
hit += 1
all_rec_movies.add(movie)
popular_sum += math.log(1 + self.movie_popular[movie])
rec_count += N
test_count += len(test_movies)
precision = hit / (1.0 * rec_count)
recall = hit / (1.0 * test_count)
coverage = len(all_rec_movies) / (1.0 * self.movie_count)
popularity = popular_sum / (1.0 * rec_count)
print ('precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' %
(precision, recall, coverage, popularity), file=sys.stderr)
if __name__ == '__main__':
ratingfile = "C://workspace//data//ml-1m//ml-1m//ratings.dat"
usercf = UserCF()
usercf.generate_dataset(ratingfile)
usercf.calc_user_sim()
usercf.evaluate()