Commit 55ff5ea4 authored by Yao544303

init

Parent 4967e7c8
*.class
*.log
/spark/target
/*/.idea
/spark/derby.log
/spark/*.iml
/standalone/mycache
This diff is collapsed.
{
"resources": {
"local_catch_file": "log/cache.txt",
"local_schema_file": "conf/ad_data.avsc"
},
"kafka":{
"topics": "datasys",
"consumer_group": "datasys_kafka2hive",
"brokerHosts": "zk1.common.ad.m.com:9092,zk2.common.ad.m.com:9092,zk3.common.ad.m.com:9092",
"zkHosts": "zk1.common.ad.m.com:2181,zk2.common.ad.m.com:2181,zk3.common.ad.m.com:2181",
"message_num": 400,
"root_dir": "/var/tmp"
},
"hdfs": {
"hdfs_name": "hadoopuser",
"hdfs_port": 50070,
"hdfs_host": "BJSH-ADHBASE-134-128.meitu-inc.com",
"hdfs_path": "/user/hadoopuser/jiangzl"
},
"hive": {
"hive_port": 10000,
"hive_host": "BJSH-ADHBASE-134-128.meitu-inc.com"
}
}
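This config appears to wire the datasys kafka2hive job to its Kafka, HDFS, and Hive endpoints. A minimal sketch of reading it with Python's standard json module; the conf/config.json path is an assumption, since the commit does not name the file:

import json

with open("conf/config.json") as f:  # hypothetical path
    conf = json.load(f)

brokers = conf["kafka"]["brokerHosts"]  # key names follow the file above
topic = conf["kafka"]["topics"]
hdfs_url = "http://%s:%d" % (conf["hdfs"]["hdfs_host"], conf["hdfs"]["hdfs_port"])
print(topic, brokers, hdfs_url)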
1::661::3::978302109
1::914::3::978301968
1::594::4::978302268
1::919::4::978301368
1::595::5::978824268
1::938::4::978301752
1::720::3::978300760
1::527::5::978824195
1::48::5::978824351
1::745::3::978824268
1::588::4::978824268
1::783::4::978824291
1::150::5::978301777
1::1::5::978824268
1::260::4::978300760
1::531::4::978302149
1::608::4::978301398
2::647::3::978299351
2::648::4::978299913
2::434::2::978300174
2::292::3::978300123
2::902::2::978298905
2::368::4::978300002
2::110::5::978298625
2::589::4::978299773
2::982::4::978299269
2::515::5::978298542
2::442::3::978300025
2::265::4::978299026
2::480::5::978299809
2::590::5::978299083
2::736::4::978300100
2::593::5::978298517
2::95::2::978300143
2::235::3::978299351
2::163::4::978299809
2::21::1::978299839
2::165::3::978300002
2::380::5::978299809
2::349::4::978299839
2::457::4::978299773
2::920::5::978298775
2::459::3::978300002
2::780::3::978299966
2::498::3::978299418
2::318::5::978298413
2::356::5::978299686
3::648::3::978297867
3::104::4::978298486
3::653::4::978297757
3::260::5::978297512
3::552::4::978297837
3::480::4::978297690
3::733::5::978297757
3::590::4::978297439
3::593::3::978297018
4::260::5::978294199
4::480::4::978294008
5::39::3::978245037
5::288::2::978246585
5::860::2::978244493
5::866::4::978245334
5::215::3::978245422
5::501::1::978244001
5::506::4::978245999
5::509::4::978245829
5::41::4::978244692
5::47::3::978245334
5::296::4::978244177
5::581::3::978244808
5::728::4::978244759
5::299::3::978242934
5::150::2::978245763
5::224::3::978245829
5::229::3::978246528
5::6::2::978245916
5::515::4::978245891
5::800::2::978244540
5::50::5::978244205
5::52::2::978246479
5::733::1::978245763
5::377::4::978245999
5::593::4::978244177
5::162::4::978244624
5::968::3::978242847
5::896::4::978244493
5::318::3::978244177
5::176::4::978244568
5::461::3::978244893
5::608::4::978244177
5::321::3::978245863
5::908::4::978241072
5::16::3::978245645
5::265::3::978245037
5::194::3::978246108
5::551::4::978246504
5::913::5::978242740
5::919::4::978241072
5::412::2::978245891
5::994::5::978244540
5::272::3::978245487
5::24::1::978242934
5::348::4::978245863
5::29::5::978245065
5::562::4::978244603
5::497::3::978245687
5::202::2::978246033
5::353::2::978246504
5::32::4::978244962
5::34::4::978244603
5::356::1::978241112
5::357::2::978245829
5::36::3::978244808
5::714::4::978244493
4 260:5 480:4
2 21:1 95:2 110:5 163:4 165:3 235:3 265:4 292:3 318:5 349:4 356:5 368:4 380:5 434:2 442:3 457:4 459:3 480:5 498:3 515:5 589:4 590:5 593:5 647:3 648:4 736:4 780:3 902:2 920:5 982:4
5 6:2 16:3 24:1 29:5 32:4 34:4 36:3 39:3 41:4 47:3 50:5 52:2 150:2 162:4 176:4 194:3 202:2 215:3 224:3 229:3 265:3 272:3 288:2 296:4 299:3 318:3 321:3 348:4 353:2 356:1 357:2 377:4 412:2 461:3 497:3 501:1 506:4 509:4 515:4 551:4 562:4 581:3 593:4 608:4 714:4 728:4 733:1 800:2 860:2 866:4 896:4 908:4 913:5 919:4 968:3 994:5
3 104:4 260:5 480:4 552:4 590:4 593:3 648:3 653:4 733:5
1 1:5 48:5 150:5 260:4 527:5 531:4 588:4 594:4 595:5 608:4 661:3 720:3 745:3 783:4 914:3 919:4 938:4
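The rows above are the sparse form of the ratings data: each row is a user id followed by that user's sorted movieId:rating pairs, as produced by changeRatings2LibSVM in the Features object below. A small Python sketch (illustration only) that parses one such row back into a dict:

def parse_ratings_row(line):
    # "4 260:5 480:4" -> ("4", {260: 5, 480: 4})
    user, *pairs = line.strip().split(" ")
    return user, {int(m): int(r) for m, r in (p.split(":") for p in pairs)}

user, ratings = parse_ratings_row("4 260:5 480:4")
assert ratings == {260: 5, 480: 4}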
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>apache.wiki</groupId>
<artifactId>RecommenderSystems</artifactId>
<version>1.0-SNAPSHOT</version>
<name>${project.artifactId}</name>
<description>My wonderful Scala app</description>
<inceptionYear>2015</inceptionYear>
<licenses>
<license>
<name>My License</name>
<url>http://....</url>
<distribution>repo</distribution>
</license>
</licenses>
<properties>
<maven.compiler.source>1.6</maven.compiler.source>
<maven.compiler.target>1.6</maven.compiler.target>
<encoding>UTF-8</encoding>
<scala.version>2.11.7</scala.version>
<scala.compat.version>2.11</scala.compat.version>
<spark.version>2.0.0</spark.version>
</properties>
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scalap</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-compiler</artifactId>
<version>${scala.version}</version>
</dependency>
<!-- Test -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.specs2</groupId>
<artifactId>specs2-junit_${scala.compat.version}</artifactId>
<version>2.4.16</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.specs2</groupId>
<artifactId>specs2-core_${scala.compat.version}</artifactId>
<version>2.4.16</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_${scala.compat.version}</artifactId>
<version>2.2.4</version>
<scope>test</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core_2.11 -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.compat.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_${scala.compat.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.compat.version}</artifactId>
<version>${spark.version}</version>
</dependency>
</dependencies>
<build>
<sourceDirectory>src/main/scala</sourceDirectory>
<testSourceDirectory>src/test/scala</testSourceDirectory>
<plugins>
<plugin>
<!-- see http://davidb.github.com/scala-maven-plugin -->
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.0</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
<configuration>
<args>
<!--<arg>-make:transitive</arg>-->
<arg>-dependencyfile</arg>
<arg>${project.build.directory}/.scala_dependencies</arg>
</args>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.18.1</version>
<configuration>
<useFile>false</useFile>
<disableXmlReport>true</disableXmlReport>
<!-- If you have classpath issue like NoDefClassError,... -->
<!-- useManifestOnlyJar>false</useManifestOnlyJar -->
<includes>
<include>**/*Test.*</include>
<include>**/*Suite.*</include>
</includes>
</configuration>
</plugin>
</plugins>
</build>
</project>
/target/*
/.idea/*
/spark-warehouse/*
*.iml
/target/
/.settings/*
*.project
*.classpath
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>apache.wiki</groupId>
<artifactId>RecommenderSystems</artifactId>
<version>1.0-SNAPSHOT</version>
<name>${project.artifactId}</name>
<description>My wonderful Scala app</description>
<inceptionYear>2015</inceptionYear>
<licenses>
<license>
<name>My License</name>
<url>http://....</url>
<distribution>repo</distribution>
</license>
</licenses>
<properties>
<!--
<spark.version>2.1.0</spark.version>
-->
<spark.version>1.6.2</spark.version>
<scala.version>2.10</scala.version>
</properties>
<repositories>
<repository>
<id>scala-tools.org</id>
<name>Scala-Tools Maven2 Repository</name>
<url>http://scala-tools.org/repo-releases</url>
</repository>
</repositories>
<pluginRepositories>
<pluginRepository>
<id>scala-tools.org</id>
<name>Scala-Tools Maven2 Repository</name>
<url>http://scala-tools.org/repo-releases</url>
</pluginRepository>
</pluginRepositories>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.version}</artifactId>
<version>${spark.version}</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.version}</artifactId>
<version>${spark.version}</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_${scala.version}</artifactId>
<version>${spark.version}</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_${scala.version}</artifactId>
<version>${spark.version}</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
<version>1.7.7</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.4</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.specs</groupId>
<artifactId>specs</artifactId>
<version>1.2.5</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.databricks</groupId>
<artifactId>spark-csv_2.10</artifactId>
<version>1.3.0</version>
<!--<scope>provided</scope>-->
</dependency>
</dependencies>
<build>
<sourceDirectory>src/main/scala</sourceDirectory>
<testSourceDirectory>src/test/scala</testSourceDirectory>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.2</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<createDependencyReducedPom>false</createDependencyReducedPom>
<finalName>example-${project.version}</finalName>
<artifactSet>
<excludes>
<exclude>oro*</exclude>
<!--exclude>org.apache.*:*</exclude-->
<exclude>junit:junit</exclude>
<exclude>org.*</exclude>
<exclude>au.*</exclude>
<exclude>codegen.*</exclude>
<exclude>images.*</exclude>
<exclude>javaewah.*</exclude>
<exclude>javassist.*</exclude>
<exclude>javax.*</exclude>
<exclude>javolution.*</exclude>
<exclude>jodd.*</exclude>
<exclude>parquet.*</exclude>
<exclude>repackage.*</exclude>
<exclude>templates.*</exclude>
<exclude>webapps.*</exclude>
<exclude>schemaorg_apache_xmlbeans.*</exclude>
<exclude>com.google.*</exclude>
<exclude>com.facebook.*</exclude>
<exclude>com.m.*</exclude>
<exclude>org.*</exclude>
<exclude>*:xml-apis</exclude>
<exclude>log4j*</exclude>
<exclude>org.antlr*</exclude>
<exclude>org.datanucleus*</exclude>
<exclude>net*</exclude>
<exclude>commons*</exclude>
<exclude>com.j*</exclude>
<exclude>com.x*</exclude>
<exclude>com.n*</exclude>
<exclude>com.i*</exclude>
<exclude>com.t*</exclude>
<exclude>com.c*</exclude>
<exclude>com.gi*</exclude>
<exclude>com.f*</exclude>
<exclude>com.su*</exclude>
<exclude>com.a*</exclude>
<exclude>com.e*</exclude>
<exclude>javax*</exclude>
<exclude>s*</exclude>
<exclude>i*</exclude>
<exclude>j*</exclude>
<exclude>a*</exclude>
<exclude>x*</exclude>
</excludes>
</artifactSet>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}.6</scalaVersion>
<args>
<arg>-target:jvm-1.5</arg>
</args>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-eclipse-plugin</artifactId>
<configuration>
<downloadSources>true</downloadSources>
<buildcommands>
<buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
</buildcommands>
<additionalProjectnatures>
<projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
</additionalProjectnatures>
<classpathContainers>
<classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
<classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
</classpathContainers>
</configuration>
</plugin>
</plugins>
</build>
<reporting>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<configuration>
<scalaVersion>${scala.version}.6</scalaVersion>
</configuration>
</plugin>
</plugins>
</reporting>
</project>
package com.apachecn.recommand.colfliter
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by ych on 2018/9/20.
*/
object Features {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("Features Prepare").setMaster("local")
val sc = new SparkContext(conf)
val ratingsPath = "C:\\dtworkspace\\recommand\\data\\ratings"
val ratingsLibSVMPath = "C:\\dtworkspace\\recommand\\data\\ratingslibsvm"
/**
 * Convert the input ratings into a sparse representation.
 * For example, the input lines
 * 1::661::3::978302109
 * 1::914::3::978301968
 * become
 * 1 661:3 914:3
 */
def changeRatings2LibSVM(): Unit = {
  sc.textFile(ratingsPath)
    .map(_.split("::"))
    .map(x => (x(0), Array((x(1).toInt, x(2).toInt)))) // (userId, [(movieId, rating)])
    .reduceByKey(_ ++ _)                               // gather each user's ratings
    .map(x => x._1 + " " + x._2.sortBy(_._1).map(p => f"${p._1}:${p._2}").mkString(" "))
    .coalesce(1).saveAsTextFile(ratingsLibSVMPath)
}

changeRatings2LibSVM()
}
}
package com.apachecn.recommand.colfliter
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import scala.collection.BitSet
/**
 * Created by ych on 2018/9/20.
 * Item-based collaborative filtering.
 */
class ItemCF {
/**
 * Compute the Jaccard similarity of every item pair, holding each item's
 * raters in a BitSet.
 */
def computeJaccardSim(sc: SparkContext,
                      pathIn: String): RDD[(String, Double)] = {
  // Build, per item, the BitSet of row indices of the users who rated it.
  val rdd = sc.textFile(pathIn)
    .map(_.split(" ", 2)(1))                     // drop the leading user id
    .zipWithIndex()
    .map(x => (x._2.toInt, x._1.split(" ", -1))) // (userIndex, ["movie:rating", ...])
    .map(x => {
      for (i <- x._2) yield {
        (i.split(":")(0), x._1)                  // (movieId, userIndex)
      }
    }).flatMap(x => x)
    .map(x => (x._1, BitSet(x._2))).reduceByKey(_.union(_))
  // Jaccard similarity of each item pair: |intersection| / |union|.
  val re = rdd.cartesian(rdd).map {
    case ((key0, set0), (key1, set1)) => {
      val key = key0 + "|" + key1
      val j = (set0 & set1).size
      val q = set0.union(set1).size
      (key, j.toDouble / q)
    }
  }
  re
}
}
object ItemCF{
}
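For reference, the same idea in plain Python (a sketch, not part of this commit): give each item the set of row indices of the users who rated it, then score every pair by |intersection| / |union|.

from itertools import combinations

# Toy data shaped like the sparse ratings file: movie id -> set of user row indices.
item_users = {"260": {0, 2, 3}, "480": {0, 1, 2, 3}, "593": {1, 2}}

jaccard = {}
for (i, ui), (j, uj) in combinations(item_users.items(), 2):
    jaccard[i + "|" + j] = len(ui & uj) / len(ui | uj)

print(jaccard)  # {'260|480': 0.75, '260|593': 0.25, '480|593': 0.5}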
package com.apachecn.recommand.colfliter
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by ych on 2018/9/20.
*/
object sparkUtils {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("utils").setMaster("local")
    val sc = new SparkContext(conf)

    // Keep only the ratings whose movie id is below 1000 and write them back out.
    def selectData(): Unit = {
      val in = "C:\\dtworkspace\\recommand\\data\\ratings"
      sc.textFile(in)
        .map(x => (x, x.split("::")(1).toInt))
        .filter(_._2 < 1000)
        .map(_._1)
        .coalesce(1).saveAsTextFile(in + "out")
    }

    selectData()
  }
}
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 14 09:07:33 2018
@author: ych
E-mail:yao544303963@gmail.com
"""
import random
import sys
import math
from operator import itemgetter
random.seed(0)
class ItemCF(object):
def __init__(self):
self.trainset = {}
self.testset = {}
self.n_sim_movie = 20
self.n_rec_movie = 10
self.movie_sim_mat = {}
self.movie_popular = {}
self.movie_count = 0
print('Similar movie number = %d' % self.n_sim_movie, file=sys.stderr)
        print('Recommended movie number = %d' % self.n_rec_movie, file=sys.stderr)
def generate_dataset(self, filename, pivot=0.7):
trainset_len = 0
testset_len = 0
fp = open(filename, 'r')
for line in fp:
user, movie, rating, _ = line.split('::')
if random.random() < pivot:
self.trainset.setdefault(user, {})
self.trainset[user][movie] = int(rating)
trainset_len += 1
else:
self.testset.setdefault(user, {})
self.testset[user][movie] = int(rating)
testset_len += 1
        print('split succ, trainset is %d, testset is %d' % (trainset_len, testset_len), file=sys.stderr)
def calc_movie_sim(self):
for user, movies in self.trainset.items():
for movie in movies:
if movie not in self.movie_popular:
self.movie_popular[movie] = 0
self.movie_popular[movie] += 1
        print('count movie number and popularity succ', file=sys.stderr)
self.movie_count = len(self.movie_popular)
print('total movie number = %d' % self.movie_count, file=sys.stderr)
itemsim_mat = self.movie_sim_mat
print('building co-rated users matrix', file=sys.stderr)
for user, movies in self.trainset.items():
for m1 in movies:
for m2 in movies:
if m1 == m2:
continue
itemsim_mat.setdefault(m1, {})
itemsim_mat[m1].setdefault(m2, 0)
itemsim_mat[m1][m2] += 1
print('build co-rated users matrix succ', file=sys.stderr)
print('calculating movie similarity matrix', file=sys.stderr)
simfactor_count = 0
PRINT_STEP = 2000000
for m1, related_movies in itemsim_mat.items():
for m2, count in related_movies.items():
itemsim_mat[m1][m2] = count / math.sqrt(self.movie_popular[m1] * self.movie_popular[m2])
simfactor_count += 1
if simfactor_count % PRINT_STEP == 0:
                    print('calculating movie similarity factor(%d)' % simfactor_count, file=sys.stderr)
        print('calculate similarity succ', file=sys.stderr)
def recommend(self, user):
K = self.n_sim_movie
N = self.n_rec_movie
rank = {}
watched_movies = self.trainset[user]
for movie, rating in watched_movies.items():
for related_movie, similarity_factor in sorted(self.movie_sim_mat[movie].items(), key=itemgetter(1),
reverse=True)[0:K]:
if related_movie in watched_movies:
continue
rank.setdefault(related_movie, 0)
rank[related_movie] += similarity_factor * rating
return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]
def evaluate(self):
print('evaluation start', file=sys.stderr)
N = self.n_rec_movie
hit = 0
rec_count = 0
test_count = 0
all_rec_movies = set()
popular_sum = 0
for i, user in enumerate(self.trainset):
if i % 500 == 0:
print('recommend for %d users ' % i, file=sys.stderr)
test_movies = self.testset.get(user, {})
rec_movies = self.recommend(user)
for movie, _ in rec_movies:
if movie in test_movies:
hit += 1
all_rec_movies.add(movie)
popular_sum += math.log(1 + self.movie_popular[movie])
rec_count += N
test_count += len(test_movies)
precision = hit / (1.0 * rec_count)
recall = hit / (1.0 * test_count)
coverage = len(all_rec_movies) / (1.0 * self.movie_count)
popularity = popular_sum / (1.0 * rec_count)
print('precision is %.4f\t recall is %.4f \t coverage is %.4f \t popularity is %.4f'
% (precision, recall, coverage, popularity), file=sys.stderr)
if __name__ == '__main__':
ratingfile = "C://workspace//data//ml-1m//ml-1m//ratings.dat"
item_cf = ItemCF()
item_cf.generate_dataset(ratingfile)
item_cf.calc_movie_sim()
item_cf.evaluate()
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 14 11:11:25 2018
@author: ych
E-mail:yao544303963@gmail.com
"""
from sklearn.externals.joblib import Memory
from sklearn.datasets import load_svmlight_file
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np
np.set_printoptions(suppress=True)
mem = Memory("./mycache")
# Input data format:
# user movie1:rating1 movie2:rating2
@mem.cache
def get_data(filename):
data = load_svmlight_file(filename)
return data[0], data[1]
# Compute Jaccard similarity
def get_jaccard_similarity(X):
n = X.shape[1]
similarity = np.zeros([n, n])
for i in range(n):
v1 = X.T[i].toarray()
for j in range(i + 1, n):
v2 = X.T[j].toarray()
sim = jaccard_similarity_score(v1, v2)
similarity[i][j] = sim
similarity[j][i] = sim
return similarity
# Compute cosine similarity
def get_cosine_similarity(X):
similarity = cosine_similarity(X)
return similarity
filename = "C:/dtworkspace/recommand/data/ratingslibsvm"
X, y = get_data(filename)
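The two helpers above are defined but never called; a minimal usage sketch (illustration only), assuming the libsvm-format file exists at the path above:

user_sim = get_cosine_similarity(X)   # rows of X are users -> user-user similarity
item_sim = get_jaccard_similarity(X)  # walks the columns of X -> item-item similarity
print(user_sim.shape, item_sim.shape)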
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 17 10:55:09 2018
@author: ych
E-mail:yao544303963@gmail.com
"""
import sys
import random
import math
import os
from operator import itemgetter
random.seed(0)
class UserCF(object):
''' TopN recommendation - User Based Collaborative Filtering '''
def __init__(self):
self.trainset = {}
self.testset = {}
self.n_sim_user = 20
self.n_rec_movie = 10
self.user_sim_mat = {}
self.movie_popular = {}
self.movie_count = 0
print ('Similar user number = %d' % self.n_sim_user, file=sys.stderr)
print ('recommended movie number = %d' %
self.n_rec_movie, file=sys.stderr)
def generate_dataset(self, filename, pivot=0.7):
''' load rating data and split it to training set and test set '''
trainset_len = 0
testset_len = 0
fp = open(filename, 'r')
for line in fp:
user, movie, rating, _ = line.split('::')
# split the data by pivot
if random.random() < pivot:
self.trainset.setdefault(user, {})
self.trainset[user][movie] = int(rating)
trainset_len += 1
else:
self.testset.setdefault(user, {})
self.testset[user][movie] = int(rating)
testset_len += 1
print ('split training set and test set succ', file=sys.stderr)
print ('train set = %s' % trainset_len, file=sys.stderr)
print ('test set = %s' % testset_len, file=sys.stderr)
def calc_user_sim(self):
''' calculate user similarity matrix '''
# build inverse table for item-users
# key=movieID, value=list of userIDs who have seen this movie
print ('building movie-users inverse table...', file=sys.stderr)
movie2users = dict()
for user, movies in self.trainset.items():
for movie in movies:
# inverse table for item-users
if movie not in movie2users:
movie2users[movie] = set()
movie2users[movie].add(user)
# count item popularity at the same time
if movie not in self.movie_popular:
self.movie_popular[movie] = 0
self.movie_popular[movie] += 1
print ('build movie-users inverse table succ', file=sys.stderr)
# save the total movie number, which will be used in evaluation
self.movie_count = len(movie2users)
print ('total movie number = %d' % self.movie_count, file=sys.stderr)
# count co-rated items between users
usersim_mat = self.user_sim_mat
print ('building user co-rated movies matrix...', file=sys.stderr)
for movie, users in movie2users.items():
for u in users:
for v in users:
if u == v:
continue
usersim_mat.setdefault(u, {})
usersim_mat[u].setdefault(v, 0)
usersim_mat[u][v] += 1
print ('build user co-rated movies matrix succ', file=sys.stderr)
# calculate similarity matrix
print ('calculating user similarity matrix...', file=sys.stderr)
simfactor_count = 0
PRINT_STEP = 2000000
for u, related_users in usersim_mat.items():
for v, count in related_users.items():
usersim_mat[u][v] = count / math.sqrt(
len(self.trainset[u]) * len(self.trainset[v]))
simfactor_count += 1
if simfactor_count % PRINT_STEP == 0:
print ('calculating user similarity factor(%d)' %
simfactor_count, file=sys.stderr)
print ('calculate user similarity matrix(similarity factor) succ',
file=sys.stderr)
print ('Total similarity factor number = %d' %
simfactor_count, file=sys.stderr)
def recommend(self, user):
''' Find K similar users and recommend N movies. '''
K = self.n_sim_user
N = self.n_rec_movie
rank = dict()
watched_movies = self.trainset[user]
for similar_user, similarity_factor in sorted(self.user_sim_mat[user].items(),
key=itemgetter(1), reverse=True)[0:K]:
for movie in self.trainset[similar_user]:
if movie in watched_movies:
continue
# predict the user's "interest" for each movie
rank.setdefault(movie, 0)
rank[movie] += similarity_factor
# return the N best movies
return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]
def evaluate(self):
''' print evaluation result: precision, recall, coverage and popularity '''
print ('Evaluation start...', file=sys.stderr)
N = self.n_rec_movie
        # variables for precision and recall
hit = 0
rec_count = 0
test_count = 0
        # variables for coverage
all_rec_movies = set()
        # variables for popularity
popular_sum = 0
for i, user in enumerate(self.trainset):
if i % 500 == 0:
print ('recommended for %d users' % i, file=sys.stderr)
test_movies = self.testset.get(user, {})
rec_movies = self.recommend(user)
for movie, _ in rec_movies:
if movie in test_movies:
hit += 1
all_rec_movies.add(movie)
popular_sum += math.log(1 + self.movie_popular[movie])
rec_count += N
test_count += len(test_movies)
precision = hit / (1.0 * rec_count)
recall = hit / (1.0 * test_count)
coverage = len(all_rec_movies) / (1.0 * self.movie_count)
popularity = popular_sum / (1.0 * rec_count)
print ('precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' %
(precision, recall, coverage, popularity), file=sys.stderr)
if __name__ == '__main__':
ratingfile = "C://workspace//data//ml-1m//ml-1m//ratings.dat"
usercf = UserCF()
usercf.generate_dataset(ratingfile)
usercf.calc_user_sim()
usercf.evaluate()