Commit 36bcca2c authored by H hathackerwang

ch04

Parent f31712e5
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 8 13:06:53 2018
@author: Administrator
"""
from numpy import *
def loadData(filename):
    '''
    Read a whitespace-separated data file in which each row holds two
    features and a class label. A constant 1.0 is prepended as x0 (the bias term).
    '''
    datamat = []; labelmat = []
    with open(filename) as fr:
        for line in fr.readlines():
            line_arr = line.strip().split()
            # feature vector [x0, x1, x2] with x0 fixed at 1.0
            datamat.append([1.0, float(line_arr[0]), float(line_arr[1])])
            labelmat.append(int(line_arr[2]))
    return datamat, labelmat

def sigmoid(inp):
    return 1.0 / (1 + exp(-inp))

def Grad_descent(datamat, labels):
    '''
    Batch gradient ascent on the log-likelihood of logistic regression.
    Returns the weight vector as an (n, 1) matrix.
    '''
    data = mat(datamat)
    label = mat(labels).transpose()
    m, n = shape(data)
    alpha = 0.001; max_iter = 500
    weights = ones((n, 1))          # initialize all weights to 1
    for k in range(max_iter):
        z = data * weights          # (m, 1) vector of linear scores
        y_pred = sigmoid(z)
        error = label - y_pred
        # gradient of the log-likelihood: X' * (y - sigmoid(Xw))
        weights = weights + alpha * data.transpose() * error
    return weights
\ No newline at end of file
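For quick reference, a minimal sketch of how these two functions are exercised elsewhere in this commit (testSet.txt is the two-feature data set included below; running from the repository directory is an assumption):

# Sketch: fit batch gradient ascent on the bundled testSet.txt.
from Grad_descent import loadData, Grad_descent

data, labels = loadData('testSet.txt')
weights = Grad_descent(data, labels)   # (3, 1) weight matrix [w0, w1, w2]
print(weights)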
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 8 15:11:21 2018
@author: Administrator
"""
from numpy import *
from Grad_descent import *
from Random_GDS import Stoch_gdescent
def classifyVec(inp, weights):
    # predict class 1 if sigmoid(w . x) > 0.5, otherwise class 0
    prob = sigmoid(sum(array(inp) * array(weights)))
    if prob > 0.5: return 1.0
    else: return 0.0

# Logistic regression prediction on the horse-colic data set
def colicTest():
    # open the training and test data sets
    frTrain = open('horseColicTraining.txt')
    frTest = open('horseColicTest.txt')
    trainingSet = []; trainingLabels = []
    # read every line of the training file
    for line in frTrain.readlines():
        # split the current line into its fields
        currLine = line.strip().split()
        # build the feature vector of this sample
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        # the 22nd field is the class label
        trainingLabels.append(float(currLine[21]))
        trainingSet.append(lineArr)
    # fit the weights with stochastic gradient ascent
    trainWeights = Stoch_gdescent(trainingSet, trainingLabels, 500)
    # count misclassified test samples and the total number of test samples
    errorCount = 0; numTestVec = 0.0
    # walk through every sample of the test set
    for line in frTest.readlines():
        numTestVec += 1.0
        currLine = line.strip().split()
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        # compare the prediction with the true label
        if int(classifyVec(lineArr, trainWeights)) != int(float(currLine[21])):
            errorCount += 1
    # overall error rate on the test set
    errorRate = float(errorCount) / numTestVec
    print('the error rate of this test is: %f' % errorRate)
    return errorRate

# repeat the test several times and report the average error rate
def multTest():
    numTests = 10; errorRateSum = 0.0
    for k in range(numTests):
        errorRateSum += colicTest()
    print('after %d iterations the average error rate is: %f'
          % (numTests, errorRateSum / float(numTests)))
\ No newline at end of file
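Running this module on its own amounts to a single call (a usage sketch; it assumes horseColicTraining.txt and horseColicTest.txt sit in the working directory, as colicTest() expects):

# Sketch: evaluate the horse-colic classifier ten times and average the error.
from Logistic_classify import multTest

multTest()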
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 8 13:51:17 2018
@author: Administrator
"""
import matplotlib
import matplotlib.pyplot as plt
from numpy import *
def plot_fit(data, labelMat, weights):
    '''
    Scatter the two classes and draw the fitted decision boundary.
    '''
    dataArr = array(data)
    n = shape(dataArr)[0]
    x_cord1 = []; y_cord1 = []
    x_cord2 = []; y_cord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            x_cord1.append(dataArr[i, 1]); y_cord1.append(dataArr[i, 2])
        else:
            x_cord2.append(dataArr[i, 1]); y_cord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(x_cord1, y_cord1, s=30, c='red', marker='s')
    ax.scatter(x_cord2, y_cord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    # decision boundary: w0 + w1*x1 + w2*x2 = 0  =>  x2 = (-w0 - w1*x1) / w2
    y = ((-weights[0] - weights[1] * x) / weights[2]).transpose()
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
\ No newline at end of file
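Why the plotted line is the decision boundary: the classifier predicts class 1 exactly when the sigmoid argument is positive, so the boundary is where that argument vanishes. In the notation used by plot_fit,

sigmoid(w0 + w1*x1 + w2*x2) = 0.5  <=>  w0 + w1*x1 + w2*x2 = 0  <=>  x2 = (-w0 - w1*x1) / w2,

which is exactly the y that plot_fit evaluates for each x in [-3.0, 3.0).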
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 8 14:30:25 2018
@author: Administrator
"""
from numpy import *
from Grad_descent import sigmoid
def Stoch_gdescent(datamat, labels, num_iter = 150):
    '''
    Stochastic gradient ascent: update the weights one randomly drawn
    sample at a time, decaying the step size alpha as iterations proceed.
    '''
    m, n = shape(datamat)
    weights = ones(n)                   # initialize all weights to 1
    for j in range(num_iter):
        for i in range(m):
            # alpha shrinks as j and i grow; the constant 0.01 keeps it
            # from ever reaching zero, so later samples still matter
            alpha = 4 / (1.0 + j + i) + 0.01
            randidx = int(random.uniform(0, m))   # pick a random sample
            z = sum(array(datamat[randidx]) * weights)
            y_pred = sigmoid(z)
            error = float(labels[randidx]) - y_pred
            # single-sample gradient: w := w + alpha * (y - sigmoid(w . x)) * x
            weights = weights + (alpha * error) * array(datamat[randidx])
    return weights
\ No newline at end of file
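Because randidx is drawn with replacement, a single pass can revisit some samples and skip others. A common variant samples without replacement within each pass; the sketch below is not part of this commit, only adds the dataIndex bookkeeping, and assumes the same imports as Random_GDS.py (from numpy import * plus sigmoid from Grad_descent):

# Variant sketch: visit each sample at most once per pass (without replacement).
def Stoch_gdescent_no_replace(datamat, labels, num_iter = 150):
    m, n = shape(datamat)
    weights = ones(n)
    for j in range(num_iter):
        dataIndex = list(range(m))            # indices not yet used in this pass
        for i in range(m):
            alpha = 4 / (1.0 + j + i) + 0.01
            randidx = int(random.uniform(0, len(dataIndex)))
            idx = dataIndex[randidx]
            error = float(labels[idx]) - sigmoid(sum(array(datamat[idx]) * weights))
            weights = weights + (alpha * error) * array(datamat[idx])
            del(dataIndex[randidx])           # do not draw this sample again this pass
    return weights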
#
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 8 13:05:29 2018
@author: Administrator
"""
from numpy import *
from Grad_descent import *
from Plot_boundary import *
from matplotlib import *
from Random_GDS import *
from Logistic_classify import *
if __name__ == '__main__':
    data, label = loadData('testSet.txt')
    # train once with stochastic gradient ascent and reuse the weights
    weights = Stoch_gdescent(data, label)
    print(weights)
    plot_fit(data, label, weights)
    multTest()
\ No newline at end of file
2 1 38.50 54 20 0 1 2 2 3 4 1 2 2 5.90 0 2 42.00 6.30 0 0 1
2 1 37.60 48 36 0 0 1 1 0 3 0 0 0 0 0 0 44.00 6.30 1 5.00 1
1 1 37.7 44 28 0 4 3 2 5 4 4 1 1 0 3 5 45 70 3 2 1
1 1 37 56 24 3 1 4 2 4 4 3 1 1 0 0 0 35 61 3 2 0
2 1 38.00 42 12 3 0 3 1 1 0 1 0 0 0 0 2 37.00 5.80 0 0 1
1 1 0 60 40 3 0 1 1 0 4 0 3 2 0 0 5 42 72 0 0 1
2 1 38.40 80 60 3 2 2 1 3 2 1 2 2 0 1 1 54.00 6.90 0 0 1
2 1 37.80 48 12 2 1 2 1 3 0 1 2 0 0 2 0 48.00 7.30 1 0 1
2 1 37.90 45 36 3 3 3 2 2 3 1 2 1 0 3 0 33.00 5.70 3 0 1
2 1 39.00 84 12 3 1 5 1 2 4 2 1 2 7.00 0 4 62.00 5.90 2 2.20 0
2 1 38.20 60 24 3 1 3 2 3 3 2 3 3 0 4 4 53.00 7.50 2 1.40 1
1 1 0 140 0 0 0 4 2 5 4 4 1 1 0 0 5 30 69 0 0 0
1 1 37.90 120 60 3 3 3 1 5 4 4 2 2 7.50 4 5 52.00 6.60 3 1.80 0
2 1 38.00 72 36 1 1 3 1 3 0 2 2 1 0 3 5 38.00 6.80 2 2.00 1
2 9 38.00 92 28 1 1 2 1 1 3 2 3 0 7.20 0 0 37.00 6.10 1 1.10 1
1 1 38.30 66 30 2 3 1 1 2 4 3 3 2 8.50 4 5 37.00 6.00 0 0 1
2 1 37.50 48 24 3 1 1 1 2 1 0 1 1 0 3 2 43.00 6.00 1 2.80 1
1 1 37.50 88 20 2 3 3 1 4 3 3 0 0 0 0 0 35.00 6.40 1 0 0
2 9 0 150 60 4 4 4 2 5 4 4 0 0 0 0 0 0 0 0 0 0
1 1 39.7 100 30 0 0 6 2 4 4 3 1 0 0 4 5 65 75 0 0 0
1 1 38.30 80 0 3 3 4 2 5 4 3 2 1 0 4 4 45.00 7.50 2 4.60 1
2 1 37.50 40 32 3 1 3 1 3 2 3 2 1 0 0 5 32.00 6.40 1 1.10 1
1 1 38.40 84 30 3 1 5 2 4 3 3 2 3 6.50 4 4 47.00 7.50 3 0 0
1 1 38.10 84 44 4 0 4 2 5 3 1 1 3 5.00 0 4 60.00 6.80 0 5.70 0
2 1 38.70 52 0 1 1 1 1 1 3 1 0 0 0 1 3 4.00 74.00 0 0 1
2 1 38.10 44 40 2 1 3 1 3 3 1 0 0 0 1 3 35.00 6.80 0 0 1
2 1 38.4 52 20 2 1 3 1 1 3 2 2 1 0 3 5 41 63 1 1 1
1 1 38.20 60 0 1 0 3 1 2 1 1 1 1 0 4 4 43.00 6.20 2 3.90 1
2 1 37.70 40 18 1 1 1 0 3 2 1 1 1 0 3 3 36.00 3.50 0 0 1
1 1 39.1 60 10 0 1 1 0 2 3 0 0 0 0 4 4 0 0 0 0 1
2 1 37.80 48 16 1 1 1 1 0 1 1 2 1 0 4 3 43.00 7.50 0 0 1
1 1 39.00 120 0 4 3 5 2 2 4 3 2 3 8.00 0 0 65.00 8.20 3 4.60 1
1 1 38.20 76 0 2 3 2 1 5 3 3 1 2 6.00 1 5 35.00 6.50 2 0.90 1
2 1 38.30 88 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 1 38.00 80 30 3 3 3 1 0 0 0 0 0 6.00 0 0 48.00 8.30 0 4.30 1
1 1 0 0 0 3 1 1 1 2 3 3 1 3 6.00 4 4 0 0 2 0 0
1 1 37.60 40 0 1 1 1 1 1 1 1 0 0 0 1 1 0 0 2 2.10 1
2 1 37.50 44 0 1 1 1 1 3 3 2 0 0 0 0 0 45.00 5.80 2 1.40 1
2 1 38.2 42 16 1 1 3 1 1 3 1 0 0 0 1 0 35 60 1 1 1
2 1 38 56 44 3 3 3 0 0 1 1 2 1 0 4 0 47 70 2 1 1
2 1 38.30 45 20 3 3 2 2 2 4 1 2 0 0 4 0 0 0 0 0 1
1 1 0 48 96 1 1 3 1 0 4 1 2 1 0 1 4 42.00 8.00 1 0 1
1 1 37.70 55 28 2 1 2 1 2 3 3 0 3 5.00 4 5 0 0 0 0 1
2 1 36.00 100 20 4 3 6 2 2 4 3 1 1 0 4 5 74.00 5.70 2 2.50 0
1 1 37.10 60 20 2 0 4 1 3 0 3 0 2 5.00 3 4 64.00 8.50 2 0 1
2 1 37.10 114 40 3 0 3 2 2 2 1 0 0 0 0 3 32.00 0 3 6.50 1
1 1 38.1 72 30 3 3 3 1 4 4 3 2 1 0 3 5 37 56 3 1 1
1 1 37.00 44 12 3 1 1 2 1 1 1 0 0 0 4 2 40.00 6.70 3 8.00 1
1 1 38.6 48 20 3 1 1 1 4 3 1 0 0 0 3 0 37 75 0 0 1
1 1 0 82 72 3 1 4 1 2 3 3 0 3 0 4 4 53 65 3 2 0
1 9 38.20 78 60 4 4 6 0 3 3 3 0 0 0 1 0 59.00 5.80 3 3.10 0
2 1 37.8 60 16 1 1 3 1 2 3 2 1 2 0 3 0 41 73 0 0 0
1 1 38.7 34 30 2 0 3 1 2 3 0 0 0 0 0 0 33 69 0 2 0
1 1 0 36 12 1 1 1 1 1 2 1 1 1 0 1 5 44.00 0 0 0 1
2 1 38.30 44 60 0 0 1 1 0 0 0 0 0 0 0 0 6.40 36.00 0 0 1
2 1 37.40 54 18 3 0 1 1 3 4 3 2 2 0 4 5 30.00 7.10 2 0 1
1 1 0 0 0 4 3 0 2 2 4 1 0 0 0 0 0 54 76 3 2 1
1 1 36.6 48 16 3 1 3 1 4 1 1 1 1 0 0 0 27 56 0 0 0
1 1 38.5 90 0 1 1 3 1 3 3 3 2 3 2 4 5 47 79 0 0 1
1 1 0 75 12 1 1 4 1 5 3 3 0 3 5.80 0 0 58.00 8.50 1 0 1
2 1 38.20 42 0 3 1 1 1 1 1 2 2 1 0 3 2 35.00 5.90 2 0 1
1 9 38.20 78 60 4 4 6 0 3 3 3 0 0 0 1 0 59.00 5.80 3 3.10 0
2 1 38.60 60 30 1 1 3 1 4 2 2 1 1 0 0 0 40.00 6.00 1 0 1
2 1 37.80 42 40 1 1 1 1 1 3 1 0 0 0 3 3 36.00 6.20 0 0 1
1 1 38 60 12 1 1 2 1 2 1 1 1 1 0 1 4 44 65 3 2 0
2 1 38.00 42 12 3 0 3 1 1 1 1 0 0 0 0 1 37.00 5.80 0 0 1
2 1 37.60 88 36 3 1 1 1 3 3 2 1 3 1.50 0 0 44.00 6.00 0 0 0
\ No newline at end of file
This diff has been collapsed.
-0.017612 14.053064 0
-1.395634 4.662541 1
-0.752157 6.538620 0
-1.322371 7.152853 0
0.423363 11.054677 0
0.406704 7.067335 1
0.667394 12.741452 0
-2.460150 6.866805 1
0.569411 9.548755 0
-0.026632 10.427743 0
0.850433 6.920334 1
1.347183 13.175500 0
1.176813 3.167020 1
-1.781871 9.097953 0
-0.566606 5.749003 1
0.931635 1.589505 1
-0.024205 6.151823 1
-0.036453 2.690988 1
-0.196949 0.444165 1
1.014459 5.754399 1
1.985298 3.230619 1
-1.693453 -0.557540 1
-0.576525 11.778922 0
-0.346811 -1.678730 1
-2.124484 2.672471 1
1.217916 9.597015 0
-0.733928 9.098687 0
-3.642001 -1.618087 1
0.315985 3.523953 1
1.416614 9.619232 0
-0.386323 3.989286 1
0.556921 8.294984 1
1.224863 11.587360 0
-1.347803 -2.406051 1
1.196604 4.951851 1
0.275221 9.543647 0
0.470575 9.332488 0
-1.889567 9.542662 0
-1.527893 12.150579 0
-1.185247 11.309318 0
-0.445678 3.297303 1
1.042222 6.105155 1
-0.618787 10.320986 0
1.152083 0.548467 1
0.828534 2.676045 1
-1.237728 10.549033 0
-0.683565 -2.166125 1
0.229456 5.921938 1
-0.959885 11.555336 0
0.492911 10.993324 0
0.184992 8.721488 0
-0.355715 10.325976 0
-0.397822 8.058397 0
0.824839 13.730343 0
1.507278 5.027866 1
0.099671 6.835839 1
-0.344008 10.717485 0
1.785928 7.718645 1
-0.918801 11.560217 0
-0.364009 4.747300 1
-0.841722 4.119083 1
0.490426 1.960539 1
-0.007194 9.075792 0
0.356107 12.447863 0
0.342578 12.281162 0
-0.810823 -1.466018 1
2.530777 6.476801 1
1.296683 11.607559 0
0.475487 12.040035 0
-0.783277 11.009725 0
0.074798 11.023650 0
-1.337472 0.468339 1
-0.102781 13.763651 0
-0.147324 2.874846 1
0.518389 9.887035 0
1.015399 7.571882 0
-1.658086 -0.027255 1
1.319944 2.171228 1
2.056216 5.019981 1
-0.851633 4.375691 1
-1.510047 6.061992 0
-1.076637 -3.181888 1
1.821096 10.283990 0
3.010150 8.401766 1
-1.099458 1.688274 1
-0.834872 -1.733869 1
-0.846637 3.849075 1
1.400102 12.628781 0
1.752842 5.468166 1
0.078557 0.059736 1
0.089392 -0.715300 1
1.825662 12.693808 0
0.197445 9.744638 0
0.126117 0.922311 1
-0.679797 1.220530 1
0.677983 2.556666 1
0.761349 10.693862 0
-2.168791 0.143632 1
1.388610 9.341997 0
0.317029 14.739025 0