提交 35e56e90 编写于 作者: H hathackerwang

决策树算法

上级 3202a0c6
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 4 13:06:48 2018
@author: Administrator
"""
from math import log
from numpy import *
def cal_entropy(data):
    """Return the Shannon entropy (base 2) of a labelled dataset.

    Each element of *data* is a feature vector whose LAST item is the
    class label; only that label column is used here.

    Args:
        data: list of lists, e.g. [[1, 1, 'yes'], [1, 0, 'no'], ...]

    Returns:
        float: H = -sum(p_k * log2(p_k)) over the class labels;
        0.0 for an empty dataset.
    """
    entries_num = len(data)
    if entries_num == 0:
        # Guard: avoids ZeroDivisionError; H(empty set) is 0 by convention.
        return 0.0
    # Count how often each class label occurs.
    label_count = {}
    for vec in data:
        cur_label = vec[-1]
        label_count[cur_label] = label_count.get(cur_label, 0) + 1
    entropy = 0.0
    for key in label_count:
        prob = float(label_count[key]) / entries_num
        # BUGFIX: the original called math.log, relying on the name `math`
        # leaking out of `from numpy import *`; use the explicitly
        # imported two-argument log(x, base) from the math module.
        entropy += prob * log(prob, 2)
    return -entropy
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 4 21:44:42 2018
@author: Administrator
"""
def classify(inp_tree, labels, test_vec):
    """Classify a feature vector by walking a trained decision tree.

    Args:
        inp_tree: nested-dict tree, {feature_name: {value: subtree_or_label}}.
        labels:   feature-name list; maps a tree node to its test_vec index.
        test_vec: feature values ordered as in *labels*.

    Returns:
        The class label stored at the leaf reached by test_vec.

    Raises:
        KeyError: if test_vec holds a feature value the tree never saw.
        (The original died with UnboundLocalError in that case, because
        class_label was only assigned inside the matching branch.)
    """
    first_node = list(inp_tree.keys())[0]
    second_dict = inp_tree[first_node]
    index = labels.index(first_node)
    value = test_vec[index]
    # BUGFIX: look the branch up directly instead of scanning every key;
    # an unseen value now raises a clear KeyError instead of falling
    # through to an unbound local.
    if value not in second_dict:
        raise KeyError('value %r for feature %r is not in the tree'
                       % (value, first_node))
    subtree = second_dict[value]
    if isinstance(subtree, dict):
        # Internal node: keep descending.
        return classify(subtree, labels, test_vec)
    # Leaf node: subtree is the class label itself.
    return subtree
def store_tree(inp_tree, filename):
    """Serialize a decision tree to *filename* with pickle.

    BUGFIX: pickle writes bytes, so the file must be opened in binary
    mode ('wb'); the original text mode 'w' raises TypeError on Python 3.
    """
    import pickle
    with open(filename, 'wb') as fp:
        pickle.dump(inp_tree, fp)
def grab_tree(filename):
    """Load a pickled decision tree from *filename*.

    BUGFIX: opens in binary mode ('rb') — pickle data is bytes and text
    mode fails on Python 3 — and uses `with` so the handle is closed
    (the original leaked the open file object).
    """
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 4 16:42:04 2018
@author: Administrator
"""
import operator
from Split_by_entropy import *
def Majority_vote(classList):
    """Majority vote: return the class label occurring most often in
    *classList*; ties go to the label that was seen first."""
    # Tally each label's occurrences.
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # max() scans keys in insertion order, so on a tie the first-seen
    # label wins — the same result the original stable reverse sort gave.
    return max(tally, key=tally.get)
def Create_Tree(dataset, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    Args:
        dataset: list of feature vectors, last item = class label.
        labels:  feature names aligned with the feature columns.

    Returns:
        A class label (leaf) or {feature_name: {value: subtree}}.
    """
    class_list = [row[-1] for row in dataset]
    # Base case 1: every sample carries the same label -> pure leaf.
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]
    # Base case 2: only the label column is left -> majority vote.
    if len(dataset[0]) == 1:
        return Majority_vote(class_list)
    # Split on the feature with the best information gain.
    best = Split_by_entropy(dataset)
    node_name = labels[best]
    # Copy before deleting so the caller's label list is not mutated
    # (the book's in-place del(labels[bestFeat]) corrupts it and later
    # raises "'no surfacing' is not in list").
    remaining = labels[:]
    del remaining[best]
    tree = {node_name: {}}
    # One branch per distinct value of the chosen feature.
    for value in {row[best] for row in dataset}:
        tree[node_name][value] = Create_Tree(
            Split_Data(dataset, best, value), remaining)
    return tree
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 4 21:10:10 2018
@author: Administrator
"""
import matplotlib.pyplot as plt
# Matplotlib annotation styles: box shapes for decision / leaf nodes
# and the arrow drawn between a parent node and its child.
decisionNode = dict(boxstyle="sawtooth", fc="0.8")  # internal-node box
leafNode = dict(boxstyle="round4", fc="0.8")  # leaf-node box
arrow_args = dict(arrowstyle="<-")  # arrow pointing parent -> child
# Draw an annotated node with an arrow from its parent.
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw *nodeTxt* in a *nodeType*-styled box at centerPt, with an
    arrow coming from parentPt.

    Uses createPlot.ax1 (the axes handle installed by createPlot);
    both points are in axes-fraction coordinates.
    """
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType,
                            arrowprops=arrow_args)
def Num_of_leaf(myTree):
    """Count the leaf nodes of a nested-dict decision tree."""
    root = list(myTree.keys())[0]
    branches = myTree[root]
    # A branch that is itself a dict is a subtree (recurse);
    # anything else is a single leaf.
    return sum(
        Num_of_leaf(child) if isinstance(child, dict) else 1
        for child in branches.values()
    )
def Depth_of_tree(myTree):
    """Return the depth of a nested-dict decision tree (a tree whose
    branches are all leaves has depth 1)."""
    root = list(myTree.keys())[0]
    deepest = 0
    for child in myTree[root].values():
        # Subtree -> 1 + its own depth; leaf -> this branch is depth 1.
        if isinstance(child, dict):
            branch_depth = 1 + Depth_of_tree(child)
        else:
            branch_depth = 1
        deepest = max(deepest, branch_depth)
    return deepest
def retrieveTree(i):
    """Return the i-th canned decision tree used for plotting tests."""
    canned = [
        {'no surfacing': {0: 'no',
                          1: {'flippers': {0: 'no', 1: 'yes'}}}},
        {'no surfacing': {0: 'no',
                          1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}},
                                           1: 'no'}}}},
    ]
    return canned[i]
def plotmidtext(cntrpt, parentpt, txtstring):
    """Write *txtstring* at the midpoint of the edge cntrpt—parentpt.

    cntrpt:    child-node coordinates (start of the edge)
    parentpt:  parent-node coordinates (end of the edge)
    txtstring: the branch-value label to draw on the edge
    """
    # Midpoint of the segment in both axes.
    xmid = (parentpt[0] - cntrpt[0]) / 2.0 + cntrpt[0]
    ymid = (parentpt[1] - cntrpt[1]) / 2.0 + cntrpt[1]
    # Draw on the shared axes handle installed by createPlot.
    createPlot.ax1.text(xmid, ymid, txtstring)
def plottree(mytree, parentpt, nodetxt):
    """Recursively draw *mytree* on createPlot.ax1.

    Layout state lives in function attributes seeded by createPlot:
    plottree.totalw / plottree.totald (total leaves / depth, scaling the
    unit grid) and plottree.xoff / plottree.yoff (the current pen position).
    *nodetxt* is the branch value written on the incoming edge.
    """
    numleafs = Num_of_leaf(mytree)
    depth = Depth_of_tree(mytree)  # NOTE(review): computed but never used
    firststr = list(mytree.keys())[0]
    # Centre this node above its leaf span: half of (leaves+1) slots
    # to the right of the current x offset.
    cntrpt = (plottree.xoff + (1.0 + float(numleafs)) / 2.0 / plottree.totalw, plottree.yoff)
    plotmidtext(cntrpt, parentpt, nodetxt)  # label on the incoming edge
    plotNode(firststr, cntrpt, parentpt, decisionNode)  # the decision node itself
    seconddict = mytree[firststr]
    # Step one level down before drawing the children.
    plottree.yoff = plottree.yoff - 1.0 / plottree.totald
    for key in seconddict.keys():
        if type(seconddict[key]).__name__ == 'dict':
            # Subtree: recurse with this node as the parent.
            plottree(seconddict[key], cntrpt, str(key))
        else:
            # Leaf: advance x by one leaf slot, then draw box and edge label.
            plottree.xoff = plottree.xoff + 1.0 / plottree.totalw
            plotNode(seconddict[key], (plottree.xoff, plottree.yoff), cntrpt, leafNode)
            plotmidtext((plottree.xoff, plottree.yoff), cntrpt, str(key))
    # Restore y so sibling subtrees at this level line up.
    plottree.yoff = plottree.yoff + 1.0 / plottree.totald
def createPlot(intree):
    """Render the decision tree *intree* in a matplotlib figure.

    Seeds the layout state consumed by plottree (total leaf count /
    depth and the initial pen offsets), then starts the recursion and
    shows the figure.
    """
    # Figure 1 with a white background; clear any previous drawing.
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])  # hide the tick marks
    # createPlot.ax1 is the shared axes handle used by the helpers.
    # 111 = 1 row, 1 column, first subplot; frameon=False hides the box.
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plottree.totalw = float(Num_of_leaf(intree))
    plottree.totald = float(Depth_of_tree(intree))
    # Start a little over half a leaf-slot left of the origin and
    # slightly above the top edge (book uses -0.5 / 1.0).
    plottree.xoff = -0.6 / plottree.totalw; plottree.yoff = 1.2;
    plottree(intree, (0.5, 1.0), '')
    plt.show()
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 4 13:35:15 2018
@author: Administrator
"""
from Cal_Entropy import *
def Split_Data(dataset, axis, value):
    """Return the rows of *dataset* whose column *axis* equals *value*,
    with that column removed (the feature is 'used up' by the split).

    axis is the feature column index; value is the feature value that
    selects the branch.
    """
    # Slicing around `axis` and concatenating reproduces the original
    # copy-then-extend construction of each reduced row.
    return [row[:axis] + row[axis + 1:]
            for row in dataset if row[axis] == value]
def Split_by_entropy(dataset):
    """Choose the feature index with the highest information gain.

    info_gain(i) = H(dataset) - sum_v p(v) * H(subset where feature i == v)

    Args:
        dataset: list of feature vectors, last item = class label.

    Returns:
        int: index of the best feature, or -1 if no split yields a
        positive gain.
    """
    feature_num = len(dataset[0]) - 1
    ent_old = cal_entropy(dataset)  # entropy before any split
    best_gain = 0.0
    best_feature = -1
    for i in range(feature_num):
        # Distinct values of feature i define the candidate branches.
        uni_val = set(x[i] for x in dataset)
        ent_new = 0.0
        for value in uni_val:
            sub_set = Split_Data(dataset, i, value)
            prob = len(sub_set) / float(len(dataset))
            # BUGFIX: the original accumulated prob * (0 - cal_entropy(...)),
            # negating the weighted entropy even though cal_entropy()
            # already returns a positive H. That made the "gain"
            # ent_old - ent_new LARGEST for the worst feature, so the
            # tree split on minimum-information features. Accumulate the
            # weighted subset entropy directly.
            ent_new += prob * cal_entropy(sub_set)
        info_gain = ent_old - ent_new
        if info_gain > best_gain:
            best_gain = info_gain
            best_feature = i
    return best_feature
\ No newline at end of file
#
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 4 13:23:05 2018
@author: Administrator
"""
from numpy import *
from Cal_Entropy import *
from Split_by_entropy import *
from Decision_Tree import *
from Plot_tree import *
from Classify_tree import *
def create_data():
    """Return the book's toy 'is it a fish' dataset and feature names.

    Columns: no surfacing, flippers, class label.
    """
    samples = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    feature_names = ['no surfacing', 'flippers']
    return samples, feature_names
if __name__ == '__main__':
    # Smoke-test the entropy / split helpers on the toy dataset.
    myData, labels = create_data()
    print(myData)
    print(cal_entropy(myData))
    print(Split_Data(myData, 0, 1))
    print(Split_by_entropy(myData))
    mytree = Create_Tree(myData, labels)
    print(mytree)
    # Plotting helpers: leaf count / depth on a canned tree, plus a
    # hand-added third branch to exercise the layout.
    myTree = retrieveTree(0)
    print(Num_of_leaf(myTree), Depth_of_tree(myTree))
    myTree['no surfacing'][3] = 'maybe'
    createPlot(myTree)
    # Train on the contact-lenses dataset shipped next to this script.
    with open('lenses.txt') as fp:
        lenses = [line.strip().split('\t') for line in fp.readlines()]
    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
    lense_Tree = Create_Tree(lenses, lensesLabels)
    # createPlot(lense_Tree)
    # BUGFIX: the tearRate value was 'reducedno' — a copy-paste fusion of
    # the dataset's 'reduced' value and the 'no lenses' class. That value
    # never appears in the tree, so classify() crashed. Use 'reduced'.
    print(classify(lense_Tree, lensesLabels, ['young', 'hyper', 'yes', 'reduced']))
young myope no reduced no lenses
young myope no normal soft
young myope yes reduced no lenses
young myope yes normal hard
young hyper no reduced no lenses
young hyper no normal soft
young hyper yes reduced no lenses
young hyper yes normal hard
pre myope no reduced no lenses
pre myope no normal soft
pre myope yes reduced no lenses
pre myope yes normal hard
pre hyper no reduced no lenses
pre hyper no normal soft
pre hyper yes reduced no lenses
pre hyper yes normal no lenses
presbyopic myope no reduced no lenses
presbyopic myope no normal no lenses
presbyopic myope yes reduced no lenses
presbyopic myope yes normal hard
presbyopic hyper no reduced no lenses
presbyopic hyper no normal soft
presbyopic hyper yes reduced no lenses
presbyopic hyper yes normal no lenses
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册