提交 445f9129 编写于 作者: H hathackerwang

ch06

上级 09a89e7f
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 13 13:02:49 2018
@author: Administrator
"""
def plotROC(predStrengths, classLabels):
    """Plot the ROC curve of a classifier and print the area under it.

    The curve is traced from the top-right corner (1, 1) down to (0, 0):
    samples are visited in ascending order of prediction strength, and each
    one moves the cursor one step down (positive sample) or one step left
    (negative sample).

    Parameters
    ----------
    predStrengths : array-like
        One prediction strength per sample.  A 1 x m matrix, an m x 1
        matrix or a flat sequence are all accepted (the values are
        flattened before sorting).
    classLabels : sequence
        One label per sample; a label equal to 1.0 counts as positive,
        anything else as negative.

    Returns
    -------
    float
        The area under the ROC curve (the same value that is printed).

    Raises
    ------
    ZeroDivisionError
        If ``classLabels`` contains no positive or no negative sample.
    """
    import matplotlib.pyplot as plt
    # Imported locally: this module has no file-level numpy import, so the
    # bare ``array`` used previously was an undefined name.
    import numpy as np

    labels = np.asarray(classLabels, dtype=float).ravel()
    scores = np.asarray(predStrengths, dtype=float).ravel()

    cur = (1.0, 1.0)  # cursor, starts in the top-right corner
    ySum = 0.0        # accumulated curve height, used for the AUC
    numPosClas = int(np.sum(labels == 1.0))
    yStep = 1 / float(numPosClas)
    xStep = 1 / float(len(labels) - numPosClas)
    # Ascending order, so the curve is drawn "in reverse" from (1, 1).
    sortedIndicies = scores.argsort()

    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)
    # Walk through all samples, drawing one line segment per sample.
    for index in sortedIndicies.tolist():
        if labels[index] == 1.0:
            delX = 0
            delY = yStep
        else:
            delX = xStep
            delY = 0
            # Only horizontal steps contribute a rectangle to the area.
            ySum += cur[1]
        # Draw the segment from cur to (cur[0]-delX, cur[1]-delY).
        ax.plot([cur[0], cur[0] - delX], [cur[1], cur[1] - delY], c='b')
        cur = (cur[0] - delX, cur[1] - delY)

    ax.plot([0, 1], [0, 1], 'b--')  # diagonal = random-guess baseline
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve for AdaBoost horse colic detection system')
    ax.axis([0, 1, 0, 1])
    plt.show()

    auc = ySum * xStep
    print("the Area Under the Curve is: ", auc)
    return auc
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 13 12:54:16 2018
@author: Administrator
"""
import copy
from numpy import *
from math import inf
import numpy as np
"""
#构建单层分类器
#单层分类器是基于最小加权分类错误率的树桩
#伪代码
#将最小错误率minError设为+∞
#对数据集中的每个特征(第一层特征):
#对每个步长(第二层特征):
#对每个不等号(第三层特征):
#建立一颗单层决策树并利用加权数据集对它进行测试
#如果错误率低于minError,则将当前单层决策树设为最佳单层决策树
#返回最佳单层决策树
"""
def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):
    """Classify every row of ``dataMatrix`` with a single-feature threshold.

    All rows start out labelled +1.0.  With ``threshIneq == 'lt'`` the rows
    whose feature ``dimen`` is <= ``threshVal`` are relabelled -1.0; with any
    other value of ``threshIneq`` the rows whose feature is > ``threshVal``
    are relabelled -1.0 instead.

    Returns an (m, 1) array of +1.0 / -1.0 labels, m being the row count.
    """
    featureColumn = dataMatrix[:, dimen]
    if threshIneq == 'lt':
        negativeMask = featureColumn <= threshVal
    else:
        negativeMask = featureColumn > threshVal
    rowCount = shape(dataMatrix)[0]
    labels = ones((rowCount, 1))
    labels[negativeMask] = -1.0
    return labels
def buildStump(dataArr,classLabels,D,numSteps=10.0):
    """Find the decision stump with the lowest weighted classification error.

    Every feature, every threshold on a regular grid spanning that feature's
    value range (one step below the minimum up to the maximum), and both
    inequality directions are tried; the stump with the smallest weighted
    error is kept.

    Parameters
    ----------
    dataArr : array-like, m rows x n features
    classLabels : sequence of m labels
    D : (m, 1) weight vector applied to the samples
    numSteps : number of grid steps per feature (default 10.0, the value
        that was previously hard-coded)

    Returns
    -------
    bestStump : dict with keys 'dim', 'thresh' and 'ineq'
    minError : weighted error of the best stump (1 x 1 matrix)
    bestClasEst : (m, 1) predictions of the best stump
    """
    # asmatrix replaces the ``mat`` alias, which NumPy 2.0 removed.
    dataMatrix = asmatrix(dataArr)
    labelMat = asmatrix(classLabels).T
    weights = asmatrix(D)
    m, n = shape(dataMatrix)
    bestStump = {}
    bestClasEst = asmatrix(zeros((m, 1)))
    minError = inf  # initial error, +infinity so any real stump beats it
    for i in range(n):  # loop over all features
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, int(numSteps) + 1):  # loop over the threshold grid
            # The threshold does not depend on the inequality, so compute it once.
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ('lt', 'gt'):  # try both directions
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # 1 where the prediction is wrong, 0 where it is right.
                errArr = asmatrix(ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = weights.T * errArr  # total error weighted by D
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 13 12:59:35 2018
@author: Administrator
"""
from Stump_classify import *
from numpy import *
def adaClassify(datToClass,classifierArr):
    """Classify data points with a trained AdaBoost ensemble.

    Parameters
    ----------
    datToClass : array-like
        The data point(s) to classify, one row per sample.
    classifierArr : sequence of dict
        The weak classifiers; each dict is read for the keys 'dim',
        'thresh', 'ineq' and 'alpha'.

    Returns
    -------
    (m, 1) matrix holding the sign of the alpha-weighted sum of all weak
    classifier predictions, one row per input sample.
    """
    # asmatrix replaces the ``mat`` alias, which NumPy 2.0 removed.
    dataMatrix = asmatrix(datToClass)
    m = shape(dataMatrix)[0]
    # Running weighted sum of the weak classifiers' votes.
    aggClassEst = asmatrix(zeros((m, 1)))
    for classifier in classifierArr:
        # Prediction of this weak classifier for every sample.
        classEst = stumpClassify(dataMatrix,
                                 classifier['dim'],
                                 classifier['thresh'],
                                 classifier['ineq'])
        # Weight the vote by the classifier's alpha and accumulate it.
        aggClassEst += classifier['alpha'] * classEst
        print('aggClassEst',aggClassEst)
    # The sign of the aggregate decides between +1 and -1.
    return sign(aggClassEst)
def loadDataSet(filename):
    """Load a tab-separated data file into feature rows and labels.

    Every non-blank line is split on tabs; the last column is the label and
    the preceding columns are the features.  The number of columns is taken
    from the first non-blank line.

    Parameters
    ----------
    filename : path of the text file to read

    Returns
    -------
    dataMat : list of list of float, one inner list of features per line
    labelMat : list of float, one label per line

    Raises
    ------
    ValueError
        If a field cannot be converted to float.
    IndexError
        If a line has fewer columns than the first non-blank line.
    """
    dataMat = []
    labelMat = []
    numFeat = None  # column count (features + label), fixed by the first row
    # A single pass inside ``with`` closes the file; it used to be opened
    # twice and never closed.
    with open(filename) as fr:
        for line in fr:
            curLine = line.strip().split('\t')
            if curLine == ['']:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            if numFeat is None:
                numFeat = len(curLine)
            dataMat.append([float(curLine[i]) for i in range(numFeat - 1)])
            labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
def classify():
    """Train AdaBoost on the horse-colic training set and score the test set.

    Reads 'horseColicTraining.txt' and 'horseColicTest.txt' from the current
    working directory, trains 10 boosting rounds, prints the test error rate
    and returns it.

    Returns
    -------
    float
        Fraction of test samples that were misclassified.
    """
    # NOTE(review): adaBoostTrainDS is not imported by the import lines
    # visible directly above this module's functions -- confirm it is in
    # scope wherever classify() is called.
    # Train the ensemble on the training set.
    datArr,labelArr = loadDataSet('horseColicTraining.txt')
    classifierArray = adaBoostTrainDS(datArr,labelArr,10)
    # Predict the test set with the trained ensemble.
    testArr,testLabelArr = loadDataSet('horseColicTest.txt')
    prediction = adaClassify(testArr,classifierArray)
    # The error vector must match the number of TEST samples (it used to be
    # sized from the training labels, which does not match ``prediction``).
    num = len(testLabelArr)
    errArr = asmatrix(ones((num, 1)))
    error = errArr[prediction != asmatrix(testLabelArr).T].sum()
    errorRate = float(error) / float(num)
    # The rate was previously passed as an unknown keyword argument to
    # print(), which raises TypeError; format it into the message instead.
    print("the errorRate is: %.2f" % errorRate)
    return errorRate
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 13 12:55:18 2018
@author: Administrator
"""
from numpy import *
from Stump_classify import *
def adaBoostTrainDS(dataArr,classLabels,numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    Parameters
    ----------
    dataArr : array-like, m rows x n features
    classLabels : sequence of m labels
    numIt : maximum number of boosting rounds (default 40)

    Returns
    -------
    list of dict
        One entry per round: the stump returned by ``buildStump`` with its
        vote weight added under the key 'alpha'.  Training stops early as
        soon as the ensemble's training error reaches 0.
    """
    weakClassArr = []
    m = shape(dataArr)[0]
    # asmatrix replaces the ``mat`` alias, which NumPy 2.0 removed.
    # Start with uniform sample weights.
    D = asmatrix(ones((m, 1)) / m)
    # Running aggregate estimate, one row per sample.  This used to be
    # ``mat((m,1))``, i.e. the 1 x 2 matrix [[m, 1]], not an m x 1 zero vector.
    aggClassEst = asmatrix(zeros((m, 1)))
    labelCol = asmatrix(classLabels).T
    for i in range(numIt):
        # Best stump for the current sample weights.
        bestStump,error,classEst = buildStump(dataArr,classLabels,D)
        print("D:",D.T)
        # buildStump reports the error as a 1 x 1 matrix; take the scalar.
        error = float(asarray(error).item())
        # Vote weight of this stump; the max() guards against division by
        # zero when the stump is perfect.
        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        print("classEst:",classEst.T)
        # exp(-alpha) for correctly classified samples, exp(+alpha) for
        # misclassified ones: wrong samples gain weight, right ones lose it.
        expon = multiply(-1 * alpha * labelCol, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()  # renormalise so the weights sum to 1
        # Add this stump's weighted vote to the aggregate estimate.
        aggClassEst = aggClassEst + alpha * classEst
        print("aggClassEst",aggClassEst.T)
        # 1 for every sample the ensemble currently gets wrong.
        aggErrors = multiply(sign(aggClassEst) != labelCol, ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print("total error:",errorRate,"\n")
        # Nothing left to learn once the training error is zero.
        if errorRate == 0.0:
            break
    return weakClassArr
\ No newline at end of file
#
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 13 12:21:58 2018
@author: Administrator
"""
from numpy import *
from Stump_classify import *
from Test_classify import *
from Train_adaboost import *
from ROC_plot import *
def loadSimpData():
    """Return a five-sample, two-feature toy data set.

    Returns a tuple ``(dataMat, classLabels)``: a 5 x 2 numpy matrix of
    feature values and a list with the matching five class labels.
    """
    # Each entry is (feature 0, feature 1, class label).
    samples = [
        (1.0, 2.1, 1.0),
        (2.0, 1.1, 1.0),
        (1.3, 1.0, -1.0),
        (1.0, 1.0, -1.0),
        (2.0, 1.0, 1.0),
    ]
    featureRows = [[first, second] for first, second, _ in samples]
    labels = [label for _, _, label in samples]
    return matrix(featureRows), labels
if __name__ == '__main__':
    # Toy data set (not used by the run below).
    data, label = loadSimpData()
    # Train 9 boosting rounds on the horse-colic training set.
    datArr,labelArr = loadDataSet('horseColicTraining.txt')
    classifierArr = adaBoostTrainDS(datArr,labelArr,9)
    # Classify the test set with the trained ensemble.
    testArr,testLabelArr = loadDataSet('horseColicTest.txt')
    prediction = adaClassify(testArr,classifierArr)
    # Size the error vector from the loaded test set instead of a
    # hard-coded 67; asmatrix replaces the ``mat`` alias removed in NumPy 2.0.
    numTest = len(testLabelArr)
    error = asmatrix(ones((numTest, 1)))
    numErrors = error[prediction != asmatrix(testLabelArr).T].sum()
    # The count used to be computed and silently discarded; report it.
    print("test errors: %d of %d (error rate %.2f)"
          % (numErrors, numTest, numErrors / numTest))
\ No newline at end of file
2 1 38.50 54 20 0 1 2 2 3 4 1 2 2 5.90 0 2 42.00 6.30 0 0 1
2 1 37.60 48 36 0 0 1 1 0 3 0 0 0 0 0 0 44.00 6.30 1 5.00 1
1 1 37.7 44 28 0 4 3 2 5 4 4 1 1 0 3 5 45 70 3 2 1
1 1 37 56 24 3 1 4 2 4 4 3 1 1 0 0 0 35 61 3 2 0
2 1 38.00 42 12 3 0 3 1 1 0 1 0 0 0 0 2 37.00 5.80 0 0 1
1 1 0 60 40 3 0 1 1 0 4 0 3 2 0 0 5 42 72 0 0 1
2 1 38.40 80 60 3 2 2 1 3 2 1 2 2 0 1 1 54.00 6.90 0 0 1
2 1 37.80 48 12 2 1 2 1 3 0 1 2 0 0 2 0 48.00 7.30 1 0 1
2 1 37.90 45 36 3 3 3 2 2 3 1 2 1 0 3 0 33.00 5.70 3 0 1
2 1 39.00 84 12 3 1 5 1 2 4 2 1 2 7.00 0 4 62.00 5.90 2 2.20 0
2 1 38.20 60 24 3 1 3 2 3 3 2 3 3 0 4 4 53.00 7.50 2 1.40 1
1 1 0 140 0 0 0 4 2 5 4 4 1 1 0 0 5 30 69 0 0 0
1 1 37.90 120 60 3 3 3 1 5 4 4 2 2 7.50 4 5 52.00 6.60 3 1.80 0
2 1 38.00 72 36 1 1 3 1 3 0 2 2 1 0 3 5 38.00 6.80 2 2.00 1
2 9 38.00 92 28 1 1 2 1 1 3 2 3 0 7.20 0 0 37.00 6.10 1 1.10 1
1 1 38.30 66 30 2 3 1 1 2 4 3 3 2 8.50 4 5 37.00 6.00 0 0 1
2 1 37.50 48 24 3 1 1 1 2 1 0 1 1 0 3 2 43.00 6.00 1 2.80 1
1 1 37.50 88 20 2 3 3 1 4 3 3 0 0 0 0 0 35.00 6.40 1 0 0
2 9 0 150 60 4 4 4 2 5 4 4 0 0 0 0 0 0 0 0 0 0
1 1 39.7 100 30 0 0 6 2 4 4 3 1 0 0 4 5 65 75 0 0 0
1 1 38.30 80 0 3 3 4 2 5 4 3 2 1 0 4 4 45.00 7.50 2 4.60 1
2 1 37.50 40 32 3 1 3 1 3 2 3 2 1 0 0 5 32.00 6.40 1 1.10 1
1 1 38.40 84 30 3 1 5 2 4 3 3 2 3 6.50 4 4 47.00 7.50 3 0 0
1 1 38.10 84 44 4 0 4 2 5 3 1 1 3 5.00 0 4 60.00 6.80 0 5.70 0
2 1 38.70 52 0 1 1 1 1 1 3 1 0 0 0 1 3 4.00 74.00 0 0 1
2 1 38.10 44 40 2 1 3 1 3 3 1 0 0 0 1 3 35.00 6.80 0 0 1
2 1 38.4 52 20 2 1 3 1 1 3 2 2 1 0 3 5 41 63 1 1 1
1 1 38.20 60 0 1 0 3 1 2 1 1 1 1 0 4 4 43.00 6.20 2 3.90 1
2 1 37.70 40 18 1 1 1 0 3 2 1 1 1 0 3 3 36.00 3.50 0 0 1
1 1 39.1 60 10 0 1 1 0 2 3 0 0 0 0 4 4 0 0 0 0 1
2 1 37.80 48 16 1 1 1 1 0 1 1 2 1 0 4 3 43.00 7.50 0 0 1
1 1 39.00 120 0 4 3 5 2 2 4 3 2 3 8.00 0 0 65.00 8.20 3 4.60 1
1 1 38.20 76 0 2 3 2 1 5 3 3 1 2 6.00 1 5 35.00 6.50 2 0.90 1
2 1 38.30 88 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 1 38.00 80 30 3 3 3 1 0 0 0 0 0 6.00 0 0 48.00 8.30 0 4.30 1
1 1 0 0 0 3 1 1 1 2 3 3 1 3 6.00 4 4 0 0 2 0 0
1 1 37.60 40 0 1 1 1 1 1 1 1 0 0 0 1 1 0 0 2 2.10 1
2 1 37.50 44 0 1 1 1 1 3 3 2 0 0 0 0 0 45.00 5.80 2 1.40 1
2 1 38.2 42 16 1 1 3 1 1 3 1 0 0 0 1 0 35 60 1 1 1
2 1 38 56 44 3 3 3 0 0 1 1 2 1 0 4 0 47 70 2 1 1
2 1 38.30 45 20 3 3 2 2 2 4 1 2 0 0 4 0 0 0 0 0 1
1 1 0 48 96 1 1 3 1 0 4 1 2 1 0 1 4 42.00 8.00 1 0 1
1 1 37.70 55 28 2 1 2 1 2 3 3 0 3 5.00 4 5 0 0 0 0 1
2 1 36.00 100 20 4 3 6 2 2 4 3 1 1 0 4 5 74.00 5.70 2 2.50 0
1 1 37.10 60 20 2 0 4 1 3 0 3 0 2 5.00 3 4 64.00 8.50 2 0 1
2 1 37.10 114 40 3 0 3 2 2 2 1 0 0 0 0 3 32.00 0 3 6.50 1
1 1 38.1 72 30 3 3 3 1 4 4 3 2 1 0 3 5 37 56 3 1 1
1 1 37.00 44 12 3 1 1 2 1 1 1 0 0 0 4 2 40.00 6.70 3 8.00 1
1 1 38.6 48 20 3 1 1 1 4 3 1 0 0 0 3 0 37 75 0 0 1
1 1 0 82 72 3 1 4 1 2 3 3 0 3 0 4 4 53 65 3 2 0
1 9 38.20 78 60 4 4 6 0 3 3 3 0 0 0 1 0 59.00 5.80 3 3.10 0
2 1 37.8 60 16 1 1 3 1 2 3 2 1 2 0 3 0 41 73 0 0 0
1 1 38.7 34 30 2 0 3 1 2 3 0 0 0 0 0 0 33 69 0 2 0
1 1 0 36 12 1 1 1 1 1 2 1 1 1 0 1 5 44.00 0 0 0 1
2 1 38.30 44 60 0 0 1 1 0 0 0 0 0 0 0 0 6.40 36.00 0 0 1
2 1 37.40 54 18 3 0 1 1 3 4 3 2 2 0 4 5 30.00 7.10 2 0 1
1 1 0 0 0 4 3 0 2 2 4 1 0 0 0 0 0 54 76 3 2 1
1 1 36.6 48 16 3 1 3 1 4 1 1 1 1 0 0 0 27 56 0 0 0
1 1 38.5 90 0 1 1 3 1 3 3 3 2 3 2 4 5 47 79 0 0 1
1 1 0 75 12 1 1 4 1 5 3 3 0 3 5.80 0 0 58.00 8.50 1 0 1
2 1 38.20 42 0 3 1 1 1 1 1 2 2 1 0 3 2 35.00 5.90 2 0 1
1 9 38.20 78 60 4 4 6 0 3 3 3 0 0 0 1 0 59.00 5.80 3 3.10 0
2 1 38.60 60 30 1 1 3 1 4 2 2 1 1 0 0 0 40.00 6.00 1 0 1
2 1 37.80 42 40 1 1 1 1 1 3 1 0 0 0 3 3 36.00 6.20 0 0 1
1 1 38 60 12 1 1 2 1 2 1 1 1 1 0 1 4 44 65 3 2 0
2 1 38.00 42 12 3 0 3 1 1 1 1 0 0 0 0 1 37.00 5.80 0 0 1
2 1 37.60 88 36 3 1 1 1 3 3 2 1 3 1.50 0 0 44.00 6.00 0 0 0
\ No newline at end of file
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册