提交 35e56e90 编写于 作者: H hathackerwang

决策树算法

上级 3202a0c6
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 4 13:06:48 2018
@author: Administrator
"""
from math import log
from numpy import *
def cal_entropy(data):
    """Return the Shannon entropy (base 2) of a labelled dataset.

    Each element of *data* is a feature vector whose LAST item is the
    class label; only that label column is used here.

    Args:
        data: list of lists, e.g. [[1, 1, 'yes'], [1, 0, 'no'], ...]

    Returns:
        float: H = -sum(p_k * log2(p_k)) over the class labels;
        0.0 for an empty dataset.
    """
    entries_num = len(data)
    if entries_num == 0:
        # Guard: avoids ZeroDivisionError; H(empty set) is 0 by convention.
        return 0.0
    # Count how often each class label occurs.
    label_count = {}
    for vec in data:
        cur_label = vec[-1]
        label_count[cur_label] = label_count.get(cur_label, 0) + 1
    entropy = 0.0
    for key in label_count:
        prob = float(label_count[key]) / entries_num
        # BUGFIX: the original called math.log, relying on the name `math`
        # leaking out of `from numpy import *`; use the explicitly
        # imported two-argument log(x, base) from the math module.
        entropy += prob * log(prob, 2)
    return -entropy
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 4 21:44:42 2018
@author: Administrator
"""
def classify(inp_tree, labels, test_vec):
    """Classify a feature vector by walking a trained decision tree.

    Args:
        inp_tree: nested-dict tree, {feature_name: {value: subtree_or_label}}.
        labels:   feature-name list; maps a tree node to its test_vec index.
        test_vec: feature values ordered as in *labels*.

    Returns:
        The class label stored at the leaf reached by test_vec.

    Raises:
        KeyError: if test_vec holds a feature value the tree never saw.
        (The original died with UnboundLocalError in that case, because
        class_label was only assigned inside the matching branch.)
    """
    first_node = list(inp_tree.keys())[0]
    second_dict = inp_tree[first_node]
    index = labels.index(first_node)
    value = test_vec[index]
    # BUGFIX: look the branch up directly instead of scanning every key;
    # an unseen value now raises a clear KeyError instead of falling
    # through to an unbound local.
    if value not in second_dict:
        raise KeyError('value %r for feature %r is not in the tree'
                       % (value, first_node))
    subtree = second_dict[value]
    if isinstance(subtree, dict):
        # Internal node: keep descending.
        return classify(subtree, labels, test_vec)
    # Leaf node: subtree is the class label itself.
    return subtree
def store_tree(inp_tree, filename):
    """Serialize a decision tree to *filename* with pickle.

    BUGFIX: pickle writes bytes, so the file must be opened in binary
    mode ('wb'); the original text mode 'w' raises TypeError on Python 3.
    """
    import pickle
    with open(filename, 'wb') as fp:
        pickle.dump(inp_tree, fp)
def grab_tree(filename):
    """Load a pickled decision tree from *filename*.

    BUGFIX: opens in binary mode ('rb') — pickle data is bytes and text
    mode fails on Python 3 — and uses `with` so the handle is closed
    (the original leaked the open file object).
    """
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 4 16:42:04 2018
@author: Administrator
"""
import operator
from Split_by_entropy import *
def Majority_vote(classList):
    """Majority vote: return the class label occurring most often in
    *classList*; ties go to the label that was seen first."""
    # Tally each label's occurrences.
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # max() scans keys in insertion order, so on a tie the first-seen
    # label wins — the same result the original stable reverse sort gave.
    return max(tally, key=tally.get)
def Create_Tree(dataset, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    Args:
        dataset: list of feature vectors, last item = class label.
        labels:  feature names aligned with the feature columns.

    Returns:
        A class label (leaf) or {feature_name: {value: subtree}}.
    """
    class_list = [row[-1] for row in dataset]
    # Base case 1: every sample carries the same label -> pure leaf.
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]
    # Base case 2: only the label column is left -> majority vote.
    if len(dataset[0]) == 1:
        return Majority_vote(class_list)
    # Split on the feature with the best information gain.
    best = Split_by_entropy(dataset)
    node_name = labels[best]
    # Copy before deleting so the caller's label list is not mutated
    # (the book's in-place del(labels[bestFeat]) corrupts it and later
    # raises "'no surfacing' is not in list").
    remaining = labels[:]
    del remaining[best]
    tree = {node_name: {}}
    # One branch per distinct value of the chosen feature.
    for value in {row[best] for row in dataset}:
        tree[node_name][value] = Create_Tree(
            Split_Data(dataset, best, value), remaining)
    return tree
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 4 21:10:10 2018
@author: Administrator
"""
import matplotlib.pyplot as plt
# Matplotlib annotation styles: box shapes for decision / leaf nodes
# and the arrow drawn between a parent node and its child.
decisionNode = dict(boxstyle="sawtooth", fc="0.8")  # internal-node box
leafNode = dict(boxstyle="round4", fc="0.8")  # leaf-node box
arrow_args = dict(arrowstyle="<-")  # arrow pointing parent -> child
# Draw an annotated node with an arrow from its parent.
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw *nodeTxt* in a *nodeType*-styled box at centerPt, with an
    arrow coming from parentPt.

    Uses createPlot.ax1 (the axes handle installed by createPlot);
    both points are in axes-fraction coordinates.
    """
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType,
                            arrowprops=arrow_args)
def Num_of_leaf(myTree):
    """Count the leaf nodes of a nested-dict decision tree."""
    root = list(myTree.keys())[0]
    branches = myTree[root]
    # A branch that is itself a dict is a subtree (recurse);
    # anything else is a single leaf.
    return sum(
        Num_of_leaf(child) if isinstance(child, dict) else 1
        for child in branches.values()
    )
def Depth_of_tree(myTree):
    """Return the depth of a nested-dict decision tree (a tree whose
    branches are all leaves has depth 1)."""
    root = list(myTree.keys())[0]
    deepest = 0
    for child in myTree[root].values():
        # Subtree -> 1 + its own depth; leaf -> this branch is depth 1.
        if isinstance(child, dict):
            branch_depth = 1 + Depth_of_tree(child)
        else:
            branch_depth = 1
        deepest = max(deepest, branch_depth)
    return deepest
def retrieveTree(i):
    """Return the i-th canned decision tree used for plotting tests."""
    canned = [
        {'no surfacing': {0: 'no',
                          1: {'flippers': {0: 'no', 1: 'yes'}}}},
        {'no surfacing': {0: 'no',
                          1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}},
                                           1: 'no'}}}},
    ]
    return canned[i]
def plotmidtext(cntrpt, parentpt, txtstring):
    """Write *txtstring* at the midpoint of the edge cntrpt—parentpt.

    cntrpt:    child-node coordinates (start of the edge)
    parentpt:  parent-node coordinates (end of the edge)
    txtstring: the branch-value label to draw on the edge
    """
    # Midpoint of the segment in both axes.
    xmid = (parentpt[0] - cntrpt[0]) / 2.0 + cntrpt[0]
    ymid = (parentpt[1] - cntrpt[1]) / 2.0 + cntrpt[1]
    # Draw on the shared axes handle installed by createPlot.
    createPlot.ax1.text(xmid, ymid, txtstring)
def plottree(mytree, parentpt, nodetxt):
    """Recursively draw *mytree* on createPlot.ax1.

    Layout state lives in function attributes seeded by createPlot:
    plottree.totalw / plottree.totald (total leaves / depth, scaling the
    unit grid) and plottree.xoff / plottree.yoff (the current pen position).
    *nodetxt* is the branch value written on the incoming edge.
    """
    numleafs = Num_of_leaf(mytree)
    depth = Depth_of_tree(mytree)  # NOTE(review): computed but never used
    firststr = list(mytree.keys())[0]
    # Centre this node above its leaf span: half of (leaves+1) slots
    # to the right of the current x offset.
    cntrpt = (plottree.xoff + (1.0 + float(numleafs)) / 2.0 / plottree.totalw, plottree.yoff)
    plotmidtext(cntrpt, parentpt, nodetxt)  # label on the incoming edge
    plotNode(firststr, cntrpt, parentpt, decisionNode)  # the decision node itself
    seconddict = mytree[firststr]
    # Step one level down before drawing the children.
    plottree.yoff = plottree.yoff - 1.0 / plottree.totald
    for key in seconddict.keys():
        if type(seconddict[key]).__name__ == 'dict':
            # Subtree: recurse with this node as the parent.
            plottree(seconddict[key], cntrpt, str(key))
        else:
            # Leaf: advance x by one leaf slot, then draw box and edge label.
            plottree.xoff = plottree.xoff + 1.0 / plottree.totalw
            plotNode(seconddict[key], (plottree.xoff, plottree.yoff), cntrpt, leafNode)
            plotmidtext((plottree.xoff, plottree.yoff), cntrpt, str(key))
    # Restore y so sibling subtrees at this level line up.
    plottree.yoff = plottree.yoff + 1.0 / plottree.totald
def createPlot(intree):
    """Render the decision tree *intree* in a matplotlib figure.

    Seeds the layout state consumed by plottree (total leaf count /
    depth and the initial pen offsets), then starts the recursion and
    shows the figure.
    """
    # Figure 1 with a white background; clear any previous drawing.
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])  # hide the tick marks
    # createPlot.ax1 is the shared axes handle used by the helpers.
    # 111 = 1 row, 1 column, first subplot; frameon=False hides the box.
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plottree.totalw = float(Num_of_leaf(intree))
    plottree.totald = float(Depth_of_tree(intree))
    # Start a little over half a leaf-slot left of the origin and
    # slightly above the top edge (book uses -0.5 / 1.0).
    plottree.xoff = -0.6 / plottree.totalw; plottree.yoff = 1.2;
    plottree(intree, (0.5, 1.0), '')
    plt.show()
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 4 13:35:15 2018
@author: Administrator
"""
from Cal_Entropy import *
def Split_Data(dataset, axis, value):
    """Return the rows of *dataset* whose column *axis* equals *value*,
    with that column removed (the feature is 'used up' by the split).

    axis is the feature column index; value is the feature value that
    selects the branch.
    """
    # Slicing around `axis` and concatenating reproduces the original
    # copy-then-extend construction of each reduced row.
    return [row[:axis] + row[axis + 1:]
            for row in dataset if row[axis] == value]
def Split_by_entropy(dataset):
    """Choose the feature index with the highest information gain.

    info_gain(i) = H(dataset) - sum_v p(v) * H(subset where feature i == v)

    Args:
        dataset: list of feature vectors, last item = class label.

    Returns:
        int: index of the best feature, or -1 if no split yields a
        positive gain.
    """
    feature_num = len(dataset[0]) - 1
    ent_old = cal_entropy(dataset)  # entropy before any split
    best_gain = 0.0
    best_feature = -1
    for i in range(feature_num):
        # Distinct values of feature i define the candidate branches.
        uni_val = set(x[i] for x in dataset)
        ent_new = 0.0
        for value in uni_val:
            sub_set = Split_Data(dataset, i, value)
            prob = len(sub_set) / float(len(dataset))
            # BUGFIX: the original accumulated prob * (0 - cal_entropy(...)),
            # negating the weighted entropy even though cal_entropy()
            # already returns a positive H. That made the "gain"
            # ent_old - ent_new LARGEST for the worst feature, so the
            # tree split on minimum-information features. Accumulate the
            # weighted subset entropy directly.
            ent_new += prob * cal_entropy(sub_set)
        info_gain = ent_old - ent_new
        if info_gain > best_gain:
            best_gain = info_gain
            best_feature = i
    return best_feature
\ No newline at end of file
#
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 4 13:23:05 2018
@author: Administrator
"""
from numpy import *
from Cal_Entropy import *
from Split_by_entropy import *
from Decision_Tree import *
from Plot_tree import *
from Classify_tree import *
def create_data():
    """Return the book's toy 'is it a fish' dataset and feature names.

    Columns: no surfacing, flippers, class label.
    """
    samples = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    feature_names = ['no surfacing', 'flippers']
    return samples, feature_names
if __name__ == '__main__':
    # Smoke-test the entropy / split helpers on the toy dataset.
    myData, labels = create_data()
    print(myData)
    print(cal_entropy(myData))
    print(Split_Data(myData, 0, 1))
    print(Split_by_entropy(myData))
    mytree = Create_Tree(myData, labels)
    print(mytree)
    # Plotting helpers: leaf count / depth on a canned tree, plus a
    # hand-added third branch to exercise the layout.
    myTree = retrieveTree(0)
    print(Num_of_leaf(myTree), Depth_of_tree(myTree))
    myTree['no surfacing'][3] = 'maybe'
    createPlot(myTree)
    # Train on the contact-lenses dataset shipped next to this script.
    with open('lenses.txt') as fp:
        lenses = [line.strip().split('\t') for line in fp.readlines()]
    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
    lense_Tree = Create_Tree(lenses, lensesLabels)
    # createPlot(lense_Tree)
    # BUGFIX: the tearRate value was 'reducedno' — a copy-paste fusion of
    # the dataset's 'reduced' value and the 'no lenses' class. That value
    # never appears in the tree, so classify() crashed. Use 'reduced'.
    print(classify(lense_Tree, lensesLabels, ['young', 'hyper', 'yes', 'reduced']))
young myope no reduced no lenses
young myope no normal soft
young myope yes reduced no lenses
young myope yes normal hard
young hyper no reduced no lenses
young hyper no normal soft
young hyper yes reduced no lenses
young hyper yes normal hard
pre myope no reduced no lenses
pre myope no normal soft
pre myope yes reduced no lenses
pre myope yes normal hard
pre hyper no reduced no lenses
pre hyper no normal soft
pre hyper yes reduced no lenses
pre hyper yes normal no lenses
presbyopic myope no reduced no lenses
presbyopic myope no normal no lenses
presbyopic myope yes reduced no lenses
presbyopic myope yes normal hard
presbyopic hyper no reduced no lenses
presbyopic hyper no normal soft
presbyopic hyper yes reduced no lenses
presbyopic hyper yes normal no lenses
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册