ch06

445f9129 · hathackerwang · 09a89e7f · 445f9129 · 445f9129 · 445f9129
14 changed file
--- a/ch06-Ada_boost/ROC_plot.py
+++ b/ch06-Ada_boost/ROC_plot.py
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jul 13 13:02:49 2018
+
+@author: Administrator
+"""
+
+def plotROC(predStrengths, classLabels):
+    """Plot the ROC curve of a classifier and print the area under it.
+
+    The curve is traced from the top-right corner (1, 1) towards the
+    origin, visiting the samples in order of increasing prediction strength.
+
+    Parameters
+    ----------
+    predStrengths : array-like
+        Real-valued prediction strength of every sample. A 1 x N matrix or
+        any array that flattens to N values is accepted.
+    classLabels : sequence
+        True class labels; the value 1.0 marks the positive class and every
+        other value is treated as negative.
+
+    Raises
+    ------
+    ValueError
+        If the two inputs differ in length, or if the labels do not contain
+        at least one positive and one negative sample (the step sizes would
+        otherwise require a division by zero).
+    """
+    # Imported locally: this module has no numpy import of its own, so the
+    # function must not rely on names star-imported by its callers.
+    import matplotlib.pyplot as plt
+    import numpy as np
+
+    strengths = np.asarray(predStrengths).ravel()
+    labels = np.asarray(classLabels).ravel()
+    if strengths.size != labels.size:
+        raise ValueError("predStrengths and classLabels must have the same length")
+
+    numPosClas = int(np.sum(labels == 1.0))
+    numNegClas = labels.size - numPosClas
+    if numPosClas == 0 or numNegClas == 0:
+        raise ValueError("classLabels must contain both positive and negative samples")
+    yStep = 1.0 / numPosClas  # vertical move for each positive sample
+    xStep = 1.0 / numNegClas  # horizontal move for each negative sample
+
+    cur = (1.0, 1.0)  # cursor
+    ySum = 0.0  # accumulated curve heights, used to compute the AUC
+    xs = [cur[0]]
+    ys = [cur[1]]
+    # argsort is ascending, so the walk starts at (1, 1) and moves down/left
+    for index in strengths.argsort():
+        if labels[index] == 1.0:
+            delX, delY = 0.0, yStep
+        else:
+            delX, delY = xStep, 0.0
+            # each horizontal step contributes a rectangle of height cur[1]
+            ySum += cur[1]
+        cur = (cur[0] - delX, cur[1] - delY)
+        xs.append(cur[0])
+        ys.append(cur[1])
+
+    fig = plt.figure()
+    fig.clf()
+    ax = plt.subplot(111)
+    # draw the whole curve as one polyline instead of one artist per segment
+    ax.plot(xs, ys, c='b')
+    ax.plot([0,1],[0,1],'b--')
+    plt.xlabel('False positive rate'); plt.ylabel('True positive rate')
+    plt.title('ROC curve for AdaBoost horse colic detection system')
+    ax.axis([0,1,0,1])
+    plt.show()
+    print ("the Area Under the Curve is: ",ySum*xStep)
--- a/ch06-Ada_boost/Stump_classify.py
+++ b/ch06-Ada_boost/Stump_classify.py
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jul 13 12:54:16 2018
+
+@author: Administrator
+"""
+
+import copy
+from numpy import *
+from math import inf
+import numpy as np
+"""
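+#Build a single-level classifier (decision stump)
+#A decision stump is a one-level tree chosen by minimum weighted classification error
+#Pseudocode
+#Set the minimum error rate minError to +infinity
+#For each feature in the data set (first-level loop):
+    #For each step size (second-level loop):
+        #For each inequality (third-level loop):
+            #Build a decision stump and test it on the weighted data set
+            #If its error rate is below minError, make it the best decision stump
+#Return the best decision stump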
+"""
+def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):
+    """Label every row as +1.0 or -1.0 by thresholding one feature column.
+
+    With threshIneq == 'lt', rows whose value in column `dimen` is less than
+    or equal to `threshVal` get -1.0; with any other value of threshIneq,
+    rows whose value is greater than `threshVal` get -1.0. All remaining
+    rows keep the label +1.0. Returns an array with one row per sample and
+    a single column.
+    """
+    column = dataMatrix[:,dimen]
+    # pick which side of the threshold is the negative class
+    negativeMask = column <= threshVal if threshIneq == 'lt' else column > threshVal
+    rowCount = shape(dataMatrix)[0]
+    retArray = ones((rowCount,1))
+    retArray[negativeMask] = -1.0
+    return retArray
+        
+def buildStump(dataArr,classLabels,D):
+    """Find the decision stump with the lowest weighted classification error.
+
+    Every feature column is scanned over evenly spaced thresholds (from one
+    step below the column minimum up to the column maximum) and both
+    inequality directions; the combination with the smallest weighted error
+    is kept.
+
+    Parameters
+    ----------
+    dataArr : array-like
+        Training samples, one row per sample.
+    classLabels : sequence
+        Class label of every sample.
+    D : matrix
+        Column vector (m x 1) of sample weights.
+
+    Returns
+    -------
+    bestStump : dict
+        Keys 'dim', 'thresh' and 'ineq' describing the chosen stump.
+    minError : matrix
+        1 x 1 matrix holding the weighted error of that stump.
+    bestClasEst : array
+        m x 1 predictions of the chosen stump on the training samples.
+    """
+    # asmatrix is the function that the `mat` alias pointed to; the alias
+    # itself no longer exists in NumPy 2.0.
+    dataMatrix = asmatrix(dataArr)
+    labelMat = asmatrix(classLabels).T
+    m,n = shape(dataMatrix)
+    numSteps = 10.0
+    bestStump = {}
+    bestClasEst = asmatrix(zeros((m,1)))
+    minError = inf #init error sum, to +infinity
+    for i in range(n):#loop over all dimensions
+        rangeMin = dataMatrix[:,i].min()
+        rangeMax = dataMatrix[:,i].max()
+        stepSize = (rangeMax-rangeMin)/numSteps
+        for j in range(-1,int(numSteps)+1):#loop over all range in current dimension
+            # the threshold does not depend on the inequality direction,
+            # so it is computed once per step
+            threshVal = (rangeMin + float(j) * stepSize)
+            for inequal in ['lt', 'gt']: #go over less than and greater than
+                predictedVals = stumpClassify(dataMatrix,i,threshVal,inequal)
+                # 1 marks a misclassified sample, 0 a correct one
+                errArr = asmatrix(ones((m,1)))
+                errArr[predictedVals == labelMat] = 0
+                weightedError = D.T*errArr  #calc total error multiplied by D
+                if weightedError < minError:
+                    minError = weightedError
+                    bestClasEst = predictedVals.copy()
+                    bestStump['dim'] = i
+                    bestStump['thresh'] = threshVal
+                    bestStump['ineq'] = inequal
+    return bestStump,minError,bestClasEst
--- a/ch06-Ada_boost/Test_classify.py
+++ b/ch06-Ada_boost/Test_classify.py
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jul 13 12:59:35 2018
+
+@author: Administrator
+"""
+
+from Stump_classify import *
+from numpy import *
+#Test AdaBoost: the AdaBoost classification function
+#@datToClass: data points to classify
+#@classifierArr: the trained final classifier (list of weak classifiers)
+def adaClassify(datToClass,classifierArr):
+    """Classify samples with a trained AdaBoost ensemble.
+
+    Parameters
+    ----------
+    datToClass : array-like
+        Samples to classify, one row per sample.
+    classifierArr : list of dict
+        Weak classifiers; each dict provides 'dim', 'thresh', 'ineq' and
+        'alpha'.
+
+    Returns
+    -------
+    matrix
+        m x 1 matrix with the sign of the alpha-weighted sum of the weak
+        classifiers' predictions.
+    """
+    # asmatrix replaces the `mat` alias, which NumPy 2.0 removed
+    dataMatrix=asmatrix(datToClass)
+    m=shape(dataMatrix)[0]
+    # running alpha-weighted sum of the weak classifiers' votes
+    aggClassEst=asmatrix(zeros((m,1)))
+    for classifier in classifierArr:
+        # prediction of this weak classifier for every sample
+        classEst=stumpClassify(dataMatrix,classifier['dim'],
+                               classifier['thresh'],
+                               classifier['ineq'])
+        aggClassEst+=classifier['alpha']*classEst
+        print('aggClassEst',aggClassEst)
+    # sign() maps the aggregate to +1 / -1 (0 stays 0)
+    return sign(aggClassEst)
+
+def loadDataSet(filename):
+    """Load a tab-separated data file into feature rows and labels.
+
+    The number of columns is taken from the first non-blank line. For every
+    non-blank line the first (columns - 1) fields become the feature row and
+    the last field becomes the label; all values are converted to float.
+    Blank lines are skipped.
+
+    Parameters
+    ----------
+    filename : str
+        Path of the file to read.
+
+    Returns
+    -------
+    dataMat : list of list of float
+        One feature row per sample.
+    labelMat : list of float
+        One label per sample.
+    """
+    dataMat=[];labelMat=[]
+    numFeat=None
+    # a single pass inside `with` guarantees the file handle is closed
+    with open(filename) as fr:
+        for line in fr:
+            curLine=line.strip().split('\t')
+            if curLine==['']:
+                continue
+            if numFeat is None:
+                # column count, including the trailing label column
+                numFeat=len(curLine)
+            dataMat.append([float(curLine[i]) for i in range(numFeat-1)])
+            labelMat.append(float(curLine[-1]))
+    return dataMat,labelMat
+
+#训练和测试分类器
+def classify():
+    """Train AdaBoost on the horse-colic data and report the test error.
+
+    Reads 'horseColicTraining.txt' and 'horseColicTest.txt' from the current
+    directory, trains with 10 boosting iterations, prints the error rate on
+    the test set and returns it.
+
+    Returns
+    -------
+    float
+        Fraction of test samples that were misclassified.
+    """
+    # Imported here because this module does not import Train_adaboost at
+    # the top level, so the name would otherwise be undefined.
+    from Train_adaboost import adaBoostTrainDS
+    # train the ensemble on the training set
+    datArr,labelArr=loadDataSet('horseColicTraining.txt')
+    classifierArray=adaBoostTrainDS(datArr,labelArr,10)
+    # evaluate it on the test set
+    testArr,testLabelArr=loadDataSet('horseColicTest.txt')
+    prediction=adaClassify(testArr,classifierArray)
+    # the sample count must come from the test labels, not the training ones
+    num=len(testLabelArr)
+    errArr=asmatrix(ones((num,1)))
+    # selecting the ones at misclassified positions and summing counts them
+    error=errArr[prediction!=asmatrix(testLabelArr).T].sum()
+    errorRate=float(error)/float(num)
+    print("the errorRate is: %.2f" % errorRate)
+    return errorRate
\ No newline at end of file
--- a/ch06-Ada_boost/Train_adaboost.py
+++ b/ch06-Ada_boost/Train_adaboost.py
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jul 13 12:55:18 2018
+
+@author: Administrator
+"""
+
+from numpy import *
+from Stump_classify import *
+#AdaBoost algorithm
+#@dataArr: data matrix
+#@classLabels: label vector
+#@numIt: number of iterations
+def adaBoostTrainDS(dataArr,classLabels,numIt=40):
+    '''
+    Train an AdaBoost ensemble of decision stumps.
+
+    @dataArr: data matrix, one row per sample
+    @classLabels: label vector, one label per sample
+    @numIt: maximum number of boosting iterations
+
+    Returns the list of weak classifiers; each entry is the dict produced by
+    buildStump with an extra 'alpha' key holding its vote weight. Training
+    stops early once the aggregate training error reaches 0.
+    '''
+    # information about every weak classifier, in training order
+    weakClassArr=[]
+    m=shape(dataArr)[0]
+    labelMat=asmatrix(classLabels).T
+    # start with equal weight on every sample
+    D=asmatrix(ones((m,1))/m)
+    # running alpha-weighted sum of predictions; it has to be an m x 1 zero
+    # vector (a bare tuple would produce the 1 x 2 matrix [[m, 1]])
+    aggClassEst=asmatrix(zeros((m,1)))
+    for _ in range(numIt):
+        # best stump for the current sample weights
+        bestStump,error,classEst=buildStump(dataArr,classLabels,D)
+        print("D:",D.T)
+        # buildStump returns the error as a 1 x 1 matrix; use a plain scalar
+        error=asarray(error).item()
+        # vote weight of this stump; the floor avoids division by zero
+        alpha=float(0.5*log((1.0-error)/(max(error,1e-16))))
+        bestStump['alpha']=alpha
+        weakClassArr.append(bestStump)
+        print("classEst:",classEst.T)
+        # exponent is -alpha for correct predictions and +alpha for wrong
+        # ones, so misclassified samples gain weight
+        expon=multiply(-1*alpha*labelMat,classEst)
+        D=multiply(D,exp(expon))
+        D=D/D.sum()
+        # add this stump's weighted vote to the aggregate estimate
+        aggClassEst = aggClassEst + alpha * classEst
+        print("aggClassEst",aggClassEst.T)
+        # 1 for every sample the current ensemble misclassifies
+        aggErrors=multiply(sign(aggClassEst)!=labelMat,ones((m,1)))
+        errorRate=aggErrors.sum()/m
+        print("total error:",errorRate,"\n")
+        # stop once the training set is classified perfectly
+        if errorRate==0.0:break
+    return weakClassArr
\ No newline at end of file
--- a/ch06-Ada_boost/__init__.py
+++ b/ch06-Ada_boost/__init__.py
+#
\ No newline at end of file
--- a/ch06-Ada_boost/__main__.py
+++ b/ch06-Ada_boost/__main__.py
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jul 13 12:21:58 2018
+
+@author: Administrator
+"""
+from numpy import *
+from Stump_classify import *
+from Test_classify import *
+from Train_adaboost import *
+from ROC_plot import *
+
+
+def loadSimpData():
+    """Return a five-sample, two-feature toy data set and its labels.
+
+    The samples come back as a 5 x 2 matrix and the labels as a list of
+    +1.0 / -1.0 values in the same order.
+    """
+    samples=[[1. ,2.1],[2. ,1.1],[1.3,1. ],[1. ,1. ],[2. ,1. ]]
+    labels=[1.0,1.0,-1.0,-1.0,1.0]
+    return matrix(samples),labels
+
+if __name__ == '__main__':
+    # Train on the horse-colic training set with 9 boosting iterations
+    datArr,labelArr = loadDataSet('horseColicTraining.txt')
+    classifierArr = adaBoostTrainDS(datArr,labelArr,9)
+    # Classify the held-out test set
+    testArr,testLabelArr = loadDataSet('horseColicTest.txt')
+    prediction = adaClassify(testArr,classifierArr)
+
+    # Count misclassified test samples; the size is taken from the data
+    # rather than hard-coded, and the result is reported instead of dropped
+    numTest = len(testLabelArr)
+    errArr = asmatrix(ones((numTest,1)))
+    errorCount = errArr[prediction != asmatrix(testLabelArr).T].sum()
+    print("test errors: %d of %d (error rate %.2f)"
+          % (errorCount, numTest, float(errorCount)/float(numTest)))
\ No newline at end of file
--- a/ch06-Ada_boost/__pycache__/ROC_plot.cpython-36.pyc
+++ b/ch06-Ada_boost/__pycache__/ROC_plot.cpython-36.pyc
--- a/ch06-Ada_boost/__pycache__/Stump_classify.cpython-36.pyc
+++ b/ch06-Ada_boost/__pycache__/Stump_classify.cpython-36.pyc
--- a/ch06-Ada_boost/__pycache__/Test_classify.cpython-36.pyc
+++ b/ch06-Ada_boost/__pycache__/Test_classify.cpython-36.pyc
--- a/ch06-Ada_boost/__pycache__/Train_adaboost.cpython-36.pyc
+++ b/ch06-Ada_boost/__pycache__/Train_adaboost.cpython-36.pyc
--- a/ch06-Ada_boost/__pycache__/__init__.cpython-36.pyc
+++ b/ch06-Ada_boost/__pycache__/__init__.cpython-36.pyc
--- a/ch06-Ada_boost/__pycache__/test.cpython-36.pyc
+++ b/ch06-Ada_boost/__pycache__/test.cpython-36.pyc
--- a/ch06-Ada_boost/horseColicTest.txt
+++ b/ch06-Ada_boost/horseColicTest.txt
+2	1	38.50	54	20	0	1	2	2	3	4	1	2	2	5.90	0	2	42.00	6.30	0	0	1
+2	1	37.60	48	36	0	0	1	1	0	3	0	0	0	0	0	0	44.00	6.30	1	5.00	1
+1	1	37.7	44	28	0	4	3	2	5	4	4	1	1	0	3	5	45	70	3	2	1
+1	1	37	56	24	3	1	4	2	4	4	3	1	1	0	0	0	35	61	3	2	0
+2	1	38.00	42	12	3	0	3	1	1	0	1	0	0	0	0	2	37.00	5.80	0	0	1
+1	1	0	60	40	3	0	1	1	0	4	0	3	2	0	0	5	42	72	0	0	1
+2	1	38.40	80	60	3	2	2	1	3	2	1	2	2	0	1	1	54.00	6.90	0	0	1
+2	1	37.80	48	12	2	1	2	1	3	0	1	2	0	0	2	0	48.00	7.30	1	0	1
+2	1	37.90	45	36	3	3	3	2	2	3	1	2	1	0	3	0	33.00	5.70	3	0	1
+2	1	39.00	84	12	3	1	5	1	2	4	2	1	2	7.00	0	4	62.00	5.90	2	2.20	0
+2	1	38.20	60	24	3	1	3	2	3	3	2	3	3	0	4	4	53.00	7.50	2	1.40	1
+1	1	0	140	0	0	0	4	2	5	4	4	1	1	0	0	5	30	69	0	0	0
+1	1	37.90	120	60	3	3	3	1	5	4	4	2	2	7.50	4	5	52.00	6.60	3	1.80	0
+2	1	38.00	72	36	1	1	3	1	3	0	2	2	1	0	3	5	38.00	6.80	2	2.00	1
+2	9	38.00	92	28	1	1	2	1	1	3	2	3	0	7.20	0	0	37.00	6.10	1	1.10	1
+1	1	38.30	66	30	2	3	1	1	2	4	3	3	2	8.50	4	5	37.00	6.00	0	0	1
+2	1	37.50	48	24	3	1	1	1	2	1	0	1	1	0	3	2	43.00	6.00	1	2.80	1
+1	1	37.50	88	20	2	3	3	1	4	3	3	0	0	0	0	0	35.00	6.40	1	0	0
+2	9	0	150	60	4	4	4	2	5	4	4	0	0	0	0	0	0	0	0	0	0
+1	1	39.7	100	30	0	0	6	2	4	4	3	1	0	0	4	5	65	75	0	0	0
+1	1	38.30	80	0	3	3	4	2	5	4	3	2	1	0	4	4	45.00	7.50	2	4.60	1
+2	1	37.50	40	32	3	1	3	1	3	2	3	2	1	0	0	5	32.00	6.40	1	1.10	1
+1	1	38.40	84	30	3	1	5	2	4	3	3	2	3	6.50	4	4	47.00	7.50	3	0	0
+1	1	38.10	84	44	4	0	4	2	5	3	1	1	3	5.00	0	4	60.00	6.80	0	5.70	0
+2	1	38.70	52	0	1	1	1	1	1	3	1	0	0	0	1	3	4.00	74.00	0	0	1
+2	1	38.10	44	40	2	1	3	1	3	3	1	0	0	0	1	3	35.00	6.80	0	0	1
+2	1	38.4	52	20	2	1	3	1	1	3	2	2	1	0	3	5	41	63	1	1	1
+1	1	38.20	60	0	1	0	3	1	2	1	1	1	1	0	4	4	43.00	6.20	2	3.90	1
+2	1	37.70	40	18	1	1	1	0	3	2	1	1	1	0	3	3	36.00	3.50	0	0	1
+1	1	39.1	60	10	0	1	1	0	2	3	0	0	0	0	4	4	0	0	0	0	1
+2	1	37.80	48	16	1	1	1	1	0	1	1	2	1	0	4	3	43.00	7.50	0	0	1
+1	1	39.00	120	0	4	3	5	2	2	4	3	2	3	8.00	0	0	65.00	8.20	3	4.60	1
+1	1	38.20	76	0	2	3	2	1	5	3	3	1	2	6.00	1	5	35.00	6.50	2	0.90	1
+2	1	38.30	88	0	0	0	6	0	0	0	0	0	0	0	0	0	0	0	0	0	0
+1	1	38.00	80	30	3	3	3	1	0	0	0	0	0	6.00	0	0	48.00	8.30	0	4.30	1
+1	1	0	0	0	3	1	1	1	2	3	3	1	3	6.00	4	4	0	0	2	0	0
+1	1	37.60	40	0	1	1	1	1	1	1	1	0	0	0	1	1	0	0	2	2.10	1
+2	1	37.50	44	0	1	1	1	1	3	3	2	0	0	0	0	0	45.00	5.80	2	1.40	1
+2	1	38.2	42	16	1	1	3	1	1	3	1	0	0	0	1	0	35	60	1	1	1
+2	1	38	56	44	3	3	3	0	0	1	1	2	1	0	4	0	47	70	2	1	1
+2	1	38.30	45	20	3	3	2	2	2	4	1	2	0	0	4	0	0	0	0	0	1
+1	1	0	48	96	1	1	3	1	0	4	1	2	1	0	1	4	42.00	8.00	1	0	1
+1	1	37.70	55	28	2	1	2	1	2	3	3	0	3	5.00	4	5	0	0	0	0	1
+2	1	36.00	100	20	4	3	6	2	2	4	3	1	1	0	4	5	74.00	5.70	2	2.50	0
+1	1	37.10	60	20	2	0	4	1	3	0	3	0	2	5.00	3	4	64.00	8.50	2	0	1
+2	1	37.10	114	40	3	0	3	2	2	2	1	0	0	0	0	3	32.00	0	3	6.50	1
+1	1	38.1	72	30	3	3	3	1	4	4	3	2	1	0	3	5	37	56	3	1	1
+1	1	37.00	44	12	3	1	1	2	1	1	1	0	0	0	4	2	40.00	6.70	3	8.00	1
+1	1	38.6	48	20	3	1	1	1	4	3	1	0	0	0	3	0	37	75	0	0	1
+1	1	0	82	72	3	1	4	1	2	3	3	0	3	0	4	4	53	65	3	2	0
+1	9	38.20	78	60	4	4	6	0	3	3	3	0	0	0	1	0	59.00	5.80	3	3.10	0
+2	1	37.8	60	16	1	1	3	1	2	3	2	1	2	0	3	0	41	73	0	0	0
+1	1	38.7	34	30	2	0	3	1	2	3	0	0	0	0	0	0	33	69	0	2	0
+1	1	0	36	12	1	1	1	1	1	2	1	1	1	0	1	5	44.00	0	0	0	1
+2	1	38.30	44	60	0	0	1	1	0	0	0	0	0	0	0	0	6.40	36.00	0	0	1
+2	1	37.40	54	18	3	0	1	1	3	4	3	2	2	0	4	5	30.00	7.10	2	0	1
+1	1	0	0	0	4	3	0	2	2	4	1	0	0	0	0	0	54	76	3	2	1
+1	1	36.6	48	16	3	1	3	1	4	1	1	1	1	0	0	0	27	56	0	0	0
+1	1	38.5	90	0	1	1	3	1	3	3	3	2	3	2	4	5	47	79	0	0	1
+1	1	0	75	12	1	1	4	1	5	3	3	0	3	5.80	0	0	58.00	8.50	1	0	1
+2	1	38.20	42	0	3	1	1	1	1	1	2	2	1	0	3	2	35.00	5.90	2	0	1
+1	9	38.20	78	60	4	4	6	0	3	3	3	0	0	0	1	0	59.00	5.80	3	3.10	0
+2	1	38.60	60	30	1	1	3	1	4	2	2	1	1	0	0	0	40.00	6.00	1	0	1
+2	1	37.80	42	40	1	1	1	1	1	3	1	0	0	0	3	3	36.00	6.20	0	0	1
+1	1	38	60	12	1	1	2	1	2	1	1	1	1	0	1	4	44	65	3	2	0
+2	1	38.00	42	12	3	0	3	1	1	1	1	0	0	0	0	1	37.00	5.80	0	0	1
+2	1	37.60	88	36	3	1	1	1	3	3	2	1	3	1.50	0	0	44.00	6.00	0	0	0
\ No newline at end of file
--- a/ch06-Ada_boost/horseColicTraining.txt
+++ b/ch06-Ada_boost/horseColicTraining.txt