# -*- coding: UTF-8 -*-
# Author: huanhuilong
# Title: SK-Learn HelloWorld
# Description: cluster text with TF-IDF + K-Means
import os
import random

import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

# Colours are reused cyclically when there are more clusters than entries.
SCATTER_COLORS = ['blue', 'green', 'yellow', 'red', 'purple', 'orange', 'brown']


def plot_result(data, cluster_res, cluster_num, algorithm='None',
                output_dir='/tmp'):
    """Scatter-plot clustered 2-D points, save the figure and display it.

    Args:
        data: Array-like of shape (n_points, >=2); columns 0 and 1 are used
            as the x and y coordinates.
        cluster_res: Sequence of cluster ids, one per row of ``data``.
        cluster_num: Number of clusters; ids ``0 .. cluster_num - 1`` are
            drawn, each in its own colour.
        algorithm: Label used as the prefix of the saved file name.
        output_dir: Directory the PNG file is written to.

    Returns:
        The path of the PNG file that was written.
    """
    points = np.asarray(data)
    labels = np.asarray(cluster_res)

    # Use a dedicated figure so repeated calls never draw on top of the
    # points left over from a previous call.
    fig, ax = plt.subplots()
    for cluster_id in range(cluster_num):
        mask = labels == cluster_id
        if not mask.any():
            # Nothing assigned to this cluster; nothing to draw.
            continue
        color = SCATTER_COLORS[cluster_id % len(SCATTER_COLORS)]
        ax.scatter(points[mask, 0], points[mask, 1],
                   c=color, alpha=1, marker='o')

    # The random component keeps successive runs from overwriting each other.
    filename = '{}-{}{}.png'.format(
        algorithm, random.randint(10, 100), cluster_num)
    output_path = os.path.join(output_dir, filename)
    fig.savefig(output_path)
    plt.show()
    plt.close(fig)
    return output_path


def _project_to_2d(features):
    """Return a (n_samples, 2) view of ``features`` suitable for plotting."""
    n_samples, n_features = features.shape
    if n_features > 2 and n_samples >= 2:
        # Plotting two arbitrary vocabulary columns hides most of the
        # structure, so project onto the two leading principal components.
        return PCA(n_components=2).fit_transform(features)
    if n_features >= 2:
        return features[:, :2]
    # Fewer than two features: pad with zeros so both axes exist.
    padding = np.zeros((n_samples, 2 - n_features))
    return np.hstack([features, padding])


def kmeans(sentences, num_of_class, random_state=None):
    """Cluster whitespace-tokenised sentences with TF-IDF + K-Means and plot.

    Args:
        sentences: Iterable of strings whose words are separated by spaces.
        num_of_class: Number of clusters to form.
        random_state: Optional seed forwarded to ``KMeans`` for reproducible
            results; ``None`` keeps the run randomised.

    Returns:
        A list with the cluster id (int) assigned to each sentence, in input
        order.
    """
    # TF-IDF vectorisation. TfidfVectorizer already yields TF-IDF weights, so
    # no additional TfidfTransformer pass is needed.
    # NOTE(review): the vectorizer's default token pattern only keeps tokens
    # of two or more word characters, so single-character words are dropped
    # -- confirm this is intended for Chinese input.
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.46)
    features = vectorizer.fit_transform(sentences).toarray()

    # K-Means clustering. n_init is pinned so behaviour does not change with
    # the library's default.
    model = KMeans(
        n_clusters=num_of_class,
        max_iter=10000,
        init="k-means++",
        n_init=10,
        tol=1e-6,
        random_state=random_state,
    )
    labels = [int(label) for label in model.fit_predict(features)]

    # Show the clustering result.
    plot_result(_project_to_2d(features), labels, num_of_class)
    return labels


def main():
    """Run the demo on a small set of pre-segmented sample sentences."""
    sentences = [
        '今天 天气 很 好',
        '今天很 好',
        '今天 天气 很 好',
        '今天 天气 很 好',
        '今天 天',
        '今天 天气 很 好',
        '今天 天气 很 好',
        '今天 天气 很 好',
        '今天 天气 很 好',
        '今天 天',
        '今天 天气 很 好',
        '气 很 好',
        '今天 天气 很 差',
        '不错',
        '还行',
    ]
    kmeans(sentences, 3)


if __name__ == "__main__":
    main()