Commit 2965e228 authored by wnma3mz

commit chapter5

Parent 670c528d
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 20 16:06:09 2017
@author: wnma3
"""
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression as LR
try:
    # RandomizedLogisticRegression was removed in scikit-learn 0.21;
    # see the alternative sketch after programmer_1
    from sklearn.linear_model import RandomizedLogisticRegression as RLR
except ImportError:
    RLR = None
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.tree import export_graphviz
from sklearn.manifold import TSNE
from keras.models import Sequential
from keras.layers import Dense, Activation
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.stattools import adfuller as ADF
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
# statsmodels.tsa.arima_model was removed in statsmodels 0.13;
# the replacement lives in statsmodels.tsa.arima.model
from statsmodels.tsa.arima.model import ARIMA
"""
cm_plot-->自定义混淆矩阵可视化
density_plot-->自定义概率密度图函数
programmer_1-->使用线性回归找出相关系数,使用随机森林算出每个系数得分
programmer_2-->使用决策数模型,生成决策树过程并保存为dot文件,天气、周末、促销决定销量
programmer_3-->使用Keras神经网络模型,训练数据预测销量高低
programmer_4-->使用KMeans聚类,做可视化操作(概率密度图)
programmer_5-->继programmer_4将数据做降维处理,并且可视化不同聚类的类别
programmer_6-->进行白噪声、平稳性检测,建立ARIMA(0, 1, 1)模型预测之后五天的结果
programmer_7-->使用Kmeans聚类之后,画出散点图,标记离群点
find_rule-->寻找关联规则的函数
connect_string-->自定义连接函数,用于实现L_{k-1}到C_k的连接
programmer_8-->菜单中各个菜品的关联程度
"""
def cm_plot(y, yp):
    cm = confusion_matrix(y, yp)
    plt.matshow(cm, cmap=plt.cm.Greens)
    plt.colorbar()
    # matshow draws cm[i, j] at row i (y axis) and column j (x axis),
    # so the annotation coordinate for cm[i, j] is (j, i)
    for i in range(len(cm)):
        for j in range(len(cm)):
            plt.annotate(cm[i, j], xy=(j, i),
                         horizontalalignment='center',
                         verticalalignment='center')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return plt

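# A minimal usage sketch for cm_plot (hypothetical labels, not from the
# book's data); the diagonal of the matrix counts correct predictions:
#
#     cm_plot([0, 1, 1, 0], [0, 1, 0, 0]).show()
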
def density_plot(data, k):
    # one kde subplot per column, k subplots in total
    p = data.plot(kind='kde', linewidth=2, subplots=True, sharex=False)
    [p[i].set_ylabel('density') for i in range(k)]
    plt.legend()
    return plt

def programmer_1():
    filename = "data/bankloan.xls"
    data = pd.read_excel(filename)

    x = data.iloc[:, :8].values
    y = data.iloc[:, 8].values

    # feature selection with randomized logistic regression
    rlr = RLR()
    rlr.fit(x, y)
    rlr_support = rlr.get_support()
    support_col = data.columns[rlr_support]
    print("rlr_support_columns: {columns}".format(
        columns=','.join(support_col)))

    # fit a plain logistic regression on the selected features
    x = data[support_col].values
    lr = LR()
    lr.fit(x, y)
    print("lr: {score}".format(score=lr.score(x, y)))

def programmer_2():
    inputfile = "data/sales_data.xls"
    data = pd.read_excel(inputfile, index_col=u'序号')

    # map the categorical values to 1/-1 (是/高/好 = yes/high/good)
    data[data == u'是'] = 1
    data[data == u'高'] = 1
    data[data == u'好'] = 1
    data[data != 1] = -1

    x = data.iloc[:, :3].values.astype(int)
    y = data.iloc[:, 3].values.astype(int)

    dtc = DTC()
    dtc.fit(x, y)

    # export the fitted tree as a Graphviz .dot file,
    # keeping the original column names as feature names
    with open("tree.dot", "w") as f:
        export_graphviz(dtc, feature_names=list(data.columns[:3]), out_file=f)

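# To render the exported tree (a sketch assuming the Graphviz binaries and
# the Python graphviz package are installed; not part of the book's code):
#
#     import graphviz
#     with open("tree.dot") as f:
#         graphviz.Source(f.read()).render("tree", format="png")
#
# or from the shell: dot -Tpng tree.dot -o tree.png
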
def programmer_3():
    inputfile = "data/sales_data.xls"
    data = pd.read_excel(inputfile, index_col=u'序号')

    # map the categorical values to 1/0 (好/是/高 = good/yes/high)
    data[data == u'好'] = 1
    data[data == u'是'] = 1
    data[data == u'高'] = 1
    data[data != 1] = 0

    x = data.iloc[:, :3].values.astype(int)
    y = data.iloc[:, 3].values.astype(int)

    # build and train the neural network
    # (Keras 2 syntax; the book originally used output_dim/nb_epoch)
    model = Sequential()
    model.add(Dense(10, input_dim=3))
    model.add(Activation('relu'))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    # binary classification, so binary cross-entropy loss
    model.compile(loss='binary_crossentropy', optimizer='adam')
    model.fit(x, y, epochs=1000, batch_size=10)
    # predict_classes was removed from recent Keras; threshold the
    # sigmoid output at 0.5 instead
    yp = (model.predict(x) > 0.5).astype(int).reshape(len(y))

    cm_plot(y, yp).show()

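# A quick numeric check to go with the confusion matrix (a sketch, not in
# the book's code):
#
#     from sklearn.metrics import accuracy_score
#     print("training accuracy: %.3f" % accuracy_score(y, yp))
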
def programmer_4():
    inputfile = 'data/consumption_data.xls'
    outputfile = 'tmp/data_type.xls'
    """
    k: number of clusters
    iteration: maximum number of clustering iterations
    model.labels_: cluster label of each sample
    model.cluster_centers_: cluster centers
    """
    k = 3
    iteration = 500
    data = pd.read_excel(inputfile, index_col='Id')
    # z-score standardization
    data_zs = 1.0 * (data - data.mean()) / data.std()

    # the book's original passed n_jobs=4, which was removed from KMeans
    # in scikit-learn 1.0
    model = KMeans(n_clusters=k, max_iter=iteration)
    model.fit(data_zs)

    # count the samples in each cluster
    r1 = pd.Series(model.labels_).value_counts()
    r2 = pd.DataFrame(model.cluster_centers_)
    r = pd.concat([r2, r1], axis=1)
    r.columns = list(data.columns) + ['cluster size']
    print(r)

    # per-sample cluster labels
    r = pd.concat([data, pd.Series(model.labels_, index=data.index)], axis=1)
    r.columns = list(data.columns) + ['cluster']
    r.to_excel(outputfile)

    # save one probability-density plot per cluster
    pic_output = 'tmp/pd_'
    for i in range(k):
        density_plot(data[r['cluster'] == i], k).savefig(
            '%s%s.png' % (pic_output, i))
    return data_zs, r

def programmer_5(data_zs, r):
    # reduce the standardized data to two dimensions with t-SNE
    tsne = TSNE()
    tsne.fit_transform(data_zs)
    tsne = pd.DataFrame(tsne.embedding_, index=data_zs.index)

    # plot each cluster with its own color and marker
    d = tsne[r['cluster'] == 0]
    plt.plot(d[0], d[1], 'r.')
    d = tsne[r['cluster'] == 1]
    plt.plot(d[0], d[1], 'go')
    d = tsne[r['cluster'] == 2]
    plt.plot(d[0], d[1], 'b*')
    plt.show()

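# The three plot calls above hard-code k=3; a sketch that colors any number
# of clusters in a single call (not the book's original style):
#
#     plt.scatter(tsne[0], tsne[1], c=r['cluster'])
#     plt.show()
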
def programmer_6():
    """
    Notes on the warnings this function may emit:
    # UserWarning: matplotlib is currently using a non-GUI backend, so cannot show the figure
        caused by calling plt.show() several times
    # RuntimeWarning: overflow encountered in exp
        limited numerical precision

    forecastnum        --> number of days to forecast
    plot_acf().show()  --> autocorrelation plot
    plot_pacf().show() --> partial autocorrelation plot
    """
    discfile = 'data/arima_data.xls'
    forecastnum = 5
    data = pd.read_excel(discfile, index_col=u'日期')
    data[u'销量'] = data[u'销量'].astype(float)

    plot_acf(data).show()
    # stationarity test; the return values are adf, pvalue, usedlag, nobs,
    # critical values, icbest, regresults, resstore
    print('ADF test of the original series:', ADF(data[u'销量']))

    # first-order difference
    D_data = data.diff().dropna()
    D_data.columns = ['sales_diff']
    # time-series plot
    D_data.plot()
    plt.show()

    plot_acf(D_data).show()
    plot_pacf(D_data).show()
    # stationarity test of the differenced series
    print('ADF test of the differenced series:', ADF(D_data['sales_diff']))
    # white-noise (Ljung-Box) test; returns the test statistic and p-value
    print('Ljung-Box test of the differenced series:',
          acorr_ljungbox(D_data, lags=1))

    # order selection: as a rule of thumb, p and q do not exceed length/10
    pmax = int(len(D_data) / 10)
    qmax = int(len(D_data) / 10)
    bic_matrix = []  # BIC matrix
    # some (p, q) combinations fail to converge, so skip them with try/except
    for p in range(pmax + 1):
        tmp = []
        for q in range(qmax + 1):
            try:
                tmp.append(ARIMA(data, order=(p, 1, q)).fit().bic)
            except Exception:
                tmp.append(None)
        bic_matrix.append(tmp)

    bic_matrix = pd.DataFrame(bic_matrix)
    # flatten with stack, then locate the minimum with idxmin
    # (cast to float so the None entries become NaN and are ignored)
    p, q = bic_matrix.stack().astype(float).idxmin()
    print('p and q with the smallest BIC: %s, %s' % (p, q))

    # build the ARIMA(p, 1, q) model; the book's data yields ARIMA(0, 1, 1)
    model = ARIMA(data, order=(p, 1, q)).fit()
    model.summary()  # model report
    # forecast the next five days; the current statsmodels ARIMA returns
    # the predicted mean only
    model.forecast(forecastnum)

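# A sketch of reading just the ADF p-value (index 1 of the result tuple)
# for an explicit stationarity decision (not part of the book's code):
#
#     pvalue = ADF(D_data['sales_diff'])[1]
#     if pvalue < 0.05:
#         print('the differenced series is stationary at the 5% level')
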
def programmer_7():
    """
    k: number of clusters
    threshold: outlier threshold
    iteration: maximum number of clustering iterations
    """
    inputfile = 'data/consumption_data.xls'
    k = 3
    threshold = 2
    iteration = 500
    data = pd.read_excel(inputfile, index_col='Id')
    # z-score standardization
    data_zs = 1.0 * (data - data.mean()) / data.std()

    model = KMeans(n_clusters=k, max_iter=iteration)
    model.fit(data_zs)

    # standardized data plus the cluster label of each sample
    r = pd.concat(
        [data_zs, pd.Series(model.labels_, index=data.index)], axis=1)
    r.columns = list(data.columns) + ['cluster']

    norm = []
    for i in range(k):  # one cluster at a time
        norm_tmp = r[['R', 'F', 'M']][r['cluster'] == i] - \
            model.cluster_centers_[i]
        # absolute distance to the cluster center
        norm_tmp = norm_tmp.apply(np.linalg.norm, axis=1)
        # relative distance, scaled by the cluster median
        norm.append(norm_tmp / norm_tmp.median())
    norm = pd.concat(norm)

    # normal points
    norm[norm <= threshold].plot(style='go')
    # outliers
    discrete_points = norm[norm > threshold]
    discrete_points.plot(style='ro')

    # label each outlier with its id and relative distance
    for i in range(len(discrete_points)):
        pid = discrete_points.index[i]
        n = discrete_points.iloc[i]
        plt.annotate('(%s, %0.2f)' % (pid, n), xy=(pid, n), xytext=(pid, n))

    plt.xlabel('id')
    plt.ylabel('relative distance')
    plt.show()

def connect_string(x, ms):
    """Join step of Apriori: build the candidate set C_k from L_{k-1}."""
    x = list(map(lambda i: sorted(i.split(ms)), x))
    l = len(x[0])
    r = []
    for i in range(len(x)):
        for j in range(i, len(x)):
            # two itemsets can be joined when they share their first l-1
            # items and differ in the last one
            if x[i][:l - 1] == x[j][:l - 1] and x[i][l - 1] != x[j][l - 1]:
                r.append(x[i][:l - 1] + sorted([x[j][l - 1], x[i][l - 1]]))
    return r

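# A worked example with hypothetical itemsets and '--' as the separator:
#
#     connect_string(['a--b', 'a--c', 'b--c'], '--')
#     # -> [['a', 'b', 'c']]
#
# only 'a--b' and 'a--c' share their first item, so they join into one
# 3-item candidate
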
def find_rule(d, support, confidence, ms=u'--'):
    # result container: one column per rule, rows for support and confidence
    result = pd.DataFrame(index=['support', 'confidence'])

    # support of each single item
    support_series = 1.0 * d.sum() / len(d)
    # first support-based filtering
    column = list(support_series[support_series > support].index)
    k = 0

    while len(column) > 1:
        k = k + 1
        print('\nsearch round %s...' % k)
        column = connect_string(column, ms)
        print('number of candidates: %s...' % len(column))

        # support computation for the newly joined itemsets
        def sf(i):
            return d[i].prod(axis=1, numeric_only=True)

        # building the joined data is the most time- and memory-consuming
        # step; for large data sets consider parallelizing it (see the
        # sketch after this function)
        d_2 = pd.DataFrame(list(map(sf, column)), index=[
            ms.join(i) for i in column]).T

        # support of the joined itemsets
        support_series_2 = 1.0 * \
            d_2[[ms.join(i) for i in column]].sum() / len(d)
        # next round of support-based filtering
        column = list(
            support_series_2[support_series_2 > support].index)
        # Series.append was removed in pandas 2.0, so use pd.concat
        support_series = pd.concat([support_series, support_series_2])

        column2 = []
        # enumerate the candidate rules: for {A,B,C}, is it A+B-->C,
        # B+C-->A or C+A-->B?
        for i in column:
            i = i.split(ms)
            for j in range(len(i)):
                column2.append(i[:j] + i[j + 1:] + i[j:j + 1])

        # confidence series
        confidence_series = pd.Series(
            index=[ms.join(i) for i in column2], dtype=float)
        # compute the confidence of each candidate rule
        for i in column2:
            confidence_series[ms.join(i)] = support_series[ms.join(
                sorted(i))] / support_series[ms.join(i[:len(i) - 1])]

        # confidence-based filtering
        for i in confidence_series[confidence_series > confidence].index:
            result[i] = 0.0
            result.loc['confidence', i] = confidence_series[i]
            result.loc['support', i] = support_series[
                ms.join(sorted(i.split(ms)))]

    # sort and output the result
    result = result.T.sort_values(
        ['confidence', 'support'], ascending=False)
    print('\nresult:')
    print(result)
    return result

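# A minimal parallelization sketch for the itemset-support step inside
# find_rule, assuming joblib is installed (not part of the book's code;
# itemset_support is a hypothetical helper):
#
#     from joblib import Parallel, delayed
#
#     def itemset_support(d, items):
#         return d[items].prod(axis=1, numeric_only=True)
#
#     supports = Parallel(n_jobs=-1)(
#         delayed(itemset_support)(d, i) for i in column)
#     d_2 = pd.DataFrame(supports, index=[ms.join(i) for i in column]).T
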
def programmer_8():
    inputfile = 'data/menu_orders.xls'
    outputfile = 'tmp/apriori_rules.xls'
    data = pd.read_excel(inputfile, header=None)

    print('\nconverting the raw data to a 0-1 matrix...')

    # helper that turns one order into a 0-1 indicator row
    def ct(x):
        return pd.Series(1, index=x[pd.notnull(x)])

    b = map(ct, data.values)
    data = pd.DataFrame(list(b)).fillna(0)
    print('\ndone.')
    del b  # free the intermediate variable

    support = 0.2  # minimum support
    confidence = 0.5  # minimum confidence
    # separator between items, e.g. A---B ('--' by default); it must not
    # occur anywhere in the raw data
    ms = '---'

    find_rule(data, support, confidence, ms).to_excel(outputfile)

if __name__ == "__main__":
    # uncomment the function(s) to run
    # programmer_1()
    # programmer_2()
    # programmer_3()
    # data_zs, r = programmer_4()
    # programmer_5(data_zs, r)
    # programmer_6()
    # programmer_7()
    # programmer_8()
    pass