diff --git a/ch02-Decision-Tree/Cal_Entropy.py b/ch02-Decision-Tree/Cal_Entropy.py
new file mode 100644
index 0000000000000000000000000000000000000000..56d9dace63332705395832fbd7a5de8f39016731
--- /dev/null
+++ b/ch02-Decision-Tree/Cal_Entropy.py
@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jul  4 13:06:48 2018
+
+@author: Administrator
+"""
+
+from math import log
+
+def cal_entropy(data):
+    '''Compute the Shannon entropy of a labelled dataset.'''
+    entries_num = len(data)
+    label_count = {}  # maps each class label to its occurrence count
+
+    for vec in data:
+        cur_label = vec[-1]
+        # The class label is the last element of each sample; count it
+        label_count[cur_label] = label_count.get(cur_label, 0) + 1
+
+    entropy = 0.0
+    # For each class, compute the probability of drawing that class
+    # from the dataset, then accumulate -p * log2(p)
+    for key in label_count:
+        prob = float(label_count[key]) / entries_num
+        entropy -= prob * log(prob, 2)  # log imported from math
+    return entropy
diff --git a/ch02-Decision-Tree/Classify_tree.py b/ch02-Decision-Tree/Classify_tree.py
new file mode 100644
index 0000000000000000000000000000000000000000..540ccc612d6daa327c67f8a86d9e0cf94dba6e89
--- /dev/null
+++ b/ch02-Decision-Tree/Classify_tree.py
@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jul  4 21:44:42 2018
+
+@author: Administrator
+"""
+
+import pickle
+
+def classify(inp_tree, labels, test_vec):
+    '''Walk the tree until a leaf (a class label) is reached.'''
+    first_node = list(inp_tree.keys())[0]
+    second_dict = inp_tree[first_node]
+    index = labels.index(first_node)
+
+    for key in second_dict.keys():
+        if test_vec[index] == key:
+            if isinstance(second_dict[key], dict):
+                # Internal node: recurse into the subtree
+                class_label = classify(second_dict[key], labels, test_vec)
+            else:
+                # Leaf node: its value is the class label
+                class_label = second_dict[key]
+    return class_label
+
+def store_tree(inp_tree, filename):
+    # pickle requires binary mode in Python 3
+    with open(filename, 'wb') as fp:
+        pickle.dump(inp_tree, fp)
+
+def grab_tree(filename):
+    with open(filename, 'rb') as fr:
+        return pickle.load(fr)
diff --git a/ch02-Decision-Tree/Decision_Tree.py b/ch02-Decision-Tree/Decision_Tree.py
new file mode 100644
index 0000000000000000000000000000000000000000..4721ffe5b451e95cd573414c452aca5f70cdbde4
--- /dev/null
+++ b/ch02-Decision-Tree/Decision_Tree.py
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jul  4 16:42:04 2018
+
+@author: Administrator
+"""
+import operator
+from Split_by_entropy import *
+
+def Majority_vote(classList):
+    '''
+    Majority vote: if class K has the most samples in this branch,
+    the branch is labelled as class K.
+    '''
+    classcount = {}
+    for vote in classList:
+        classcount[vote] = classcount.get(vote, 0) + 1
+    sorted_count = sorted(classcount.items(), key=operator.itemgetter(1),
+                          reverse=True)
+    # Count the samples of each class (default 0), sort descending,
+    # and return the class with the largest count
+    return sorted_count[0][0]
+
+def Create_Tree(dataset, labels):
+    classList = [x[-1] for x in dataset]
+    if classList.count(classList[0]) == len(classList):
+        # All samples share one class: return that class as a leaf
+        return classList[0]
+    if len(dataset[0]) == 1:
+        # No features left to split on: fall back to majority vote
+        return Majority_vote(classList)
+
+    best_feature = Split_by_entropy(dataset)
+    best_labels = labels[best_feature]
+    myTree = {best_labels: {}}
+
+    # The book has a mistake here: it writes del(labels[bestFeat]),
+    # which mutates the caller's original list; running it that way
+    # raises "'no surfacing' is not in list". Copy the label list
+    # first, then delete the chosen feature from the copy.
+    subLabels = labels[:]
+    del(subLabels[best_feature])
+
+    # Collect the column of values for the chosen feature
+    f_val = [x[best_feature] for x in dataset]
+    uni_val = set(f_val)
+    for value in uni_val:
+        # Recursively build and attach each subtree
+        myTree[best_labels][value] = Create_Tree(
+            Split_Data(dataset, best_feature, value), subLabels)
+
+    return myTree
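As a quick sanity check of `cal_entropy`, its output can be compared against the entropy formula worked out by hand. A minimal sketch, assuming the modules above are on the import path and using the toy dataset from `__main__.py` (2 'yes' and 3 'no' labels):

```python
# Hand-check cal_entropy on the toy fish dataset: expected value is
# -(2/5)log2(2/5) - (3/5)log2(3/5), roughly 0.9710.
from math import log
from Cal_Entropy import cal_entropy

data = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'],
        [0, 1, 'no'], [0, 1, 'no']]

expected = -(2/5) * log(2/5, 2) - (3/5) * log(3/5, 2)
print(cal_entropy(data))  # ~0.9710
print(expected)           # ~0.9710
```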
diff --git a/ch02-Decision-Tree/Plot_tree.py b/ch02-Decision-Tree/Plot_tree.py
new file mode 100644
index 0000000000000000000000000000000000000000..03e9afccea199fcbbd419b548eb850410a36ec7e
--- /dev/null
+++ b/ch02-Decision-Tree/Plot_tree.py
@@ -0,0 +1,108 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jul  4 21:10:10 2018
+
+@author: Administrator
+"""
+
+import matplotlib.pyplot as plt
+
+# Define the text-box and arrow styles
+decisionNode = dict(boxstyle="sawtooth", fc="0.8")
+leafNode = dict(boxstyle="round4", fc="0.8")
+arrow_args = dict(arrowstyle="<-")
+
+# Draw an annotation with an arrow from the parent to the child node
+def plotNode(nodeTxt, centerPt, parentPt, nodeType):
+    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
+                            xytext=centerPt, textcoords='axes fraction',
+                            va="center", ha="center", bbox=nodeType,
+                            arrowprops=arrow_args)
+
+def Num_of_leaf(myTree):
+    '''Count the leaf nodes of the tree.'''
+    num_leaf = 0
+    first_node = list(myTree.keys())[0]
+    second_dict = myTree[first_node]
+    # In Python 3 dict.keys() returns a view, so it is converted with
+    # list(); the book indexes it with [0] directly, which only works
+    # in Python 2.
+    # For each value: recurse if it is a dict, otherwise count a leaf.
+    for key in second_dict.keys():
+        if type(second_dict[key]).__name__ == 'dict':
+            num_leaf += Num_of_leaf(second_dict[key])
+        else: num_leaf += 1
+    return num_leaf
+
+def Depth_of_tree(myTree):
+    '''Compute the total depth of the tree.'''
+    depth = 0
+    first_node = list(myTree.keys())[0]
+    second_dict = myTree[first_node]
+
+    for key in second_dict.keys():
+        if type(second_dict[key]).__name__ == 'dict':
+            pri_depth = 1 + Depth_of_tree(second_dict[key])
+        else: pri_depth = 1
+        # For each value: recurse if it is a dict, otherwise depth 1,
+        # keeping the maximum over all branches
+        if pri_depth > depth: depth = pri_depth
+    return depth
+
+def retrieveTree(i):
+    '''Canned trees used as test data.'''
+    listOfTrees = [{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no',
+                        1: 'yes'}}}},
+                   {'no surfacing': {0: 'no',
+                        1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}},
+                            1: 'no'}}}}]
+    return listOfTrees[i]
+
+def plotmidtext(cntrpt, parentpt, txtstring):
+    '''Place a text label at the midpoint of an edge.
+    cntrpt is the child (start) point, parentpt the parent (end) point,
+    txtstring the label text.
+    '''
+    xmid = (parentpt[0] - cntrpt[0]) / 2.0 + cntrpt[0]
+    ymid = (parentpt[1] - cntrpt[1]) / 2.0 + cntrpt[1]
+    # Midpoint between the child and parent coordinates
+    createPlot.ax1.text(xmid, ymid, txtstring)
+
+def plottree(mytree, parentpt, nodetxt):
+    numleafs = Num_of_leaf(mytree)
+    depth = Depth_of_tree(mytree)
+    firststr = list(mytree.keys())[0]
+    # Compute the coordinates of the child node
+    cntrpt = (plottree.xoff + (1.0 + float(numleafs)) / 2.0 / plottree.totalw,
+              plottree.yoff)
+    plotmidtext(cntrpt, parentpt, nodetxt)  # label the edge
+    plotNode(firststr, cntrpt, parentpt, decisionNode)  # draw the node
+    seconddict = mytree[firststr]
+    # Decrease y by 1.0/plottree.totald per level so that deeper levels
+    # are drawn lower on the canvas
+    plottree.yoff = plottree.yoff - 1.0 / plottree.totald
+    for key in seconddict.keys():
+        if type(seconddict[key]).__name__ == 'dict':
+            plottree(seconddict[key], cntrpt, str(key))
+        else:
+            plottree.xoff = plottree.xoff + 1.0 / plottree.totalw
+            plotNode(seconddict[key], (plottree.xoff, plottree.yoff),
+                     cntrpt, leafNode)
+            plotmidtext((plottree.xoff, plottree.yoff), cntrpt, str(key))
+    plottree.yoff = plottree.yoff + 1.0 / plottree.totald
+
+def createPlot(intree):
+    # Like MATLAB's figure: create a canvas with a white background
+    fig = plt.figure(1, facecolor='white')
+    fig.clf()  # clear the canvas
+    axprops = dict(xticks=[], yticks=[])
+    # createPlot.ax1 is a global handle to the axes; 111 means a 1x1
+    # grid, first subplot, and frameon=False suppresses the axes frame
+    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
+
+    plottree.totalw = float(Num_of_leaf(intree))
+    plottree.totald = float(Depth_of_tree(intree))
+    plottree.xoff = -0.6 / plottree.totalw
+    plottree.yoff = 1.2
+    plottree(intree, (0.5, 1.0), '')
+    plt.show()
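The two tree metrics drive the plot layout (leaf count sets the x spacing, depth sets the y spacing), so they are worth checking on the first canned tree. A minimal sketch, assuming the modules above are importable:

```python
# retrieveTree(0) has three leaves ('no', 'no', 'yes') and two levels
# of decision nodes ('no surfacing' and 'flippers').
from Plot_tree import retrieveTree, Num_of_leaf, Depth_of_tree

tree = retrieveTree(0)
print(Num_of_leaf(tree))    # 3
print(Depth_of_tree(tree))  # 2
```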
diff --git a/ch02-Decision-Tree/Split_by_entropy.py b/ch02-Decision-Tree/Split_by_entropy.py
new file mode 100644
index 0000000000000000000000000000000000000000..2909335d30079d3f4786a724fc3cc6924599ada4
--- /dev/null
+++ b/ch02-Decision-Tree/Split_by_entropy.py
@@ -0,0 +1,60 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jul  4 13:35:15 2018
+
+@author: Administrator
+"""
+
+from Cal_Entropy import *
+
+def Split_Data(dataset, axis, value):
+    '''
+    Split the dataset on the given axis and value.
+    axis is the feature's position within each sample; value is the
+    feature value used for the split.
+    '''
+    new_subset = []
+    # Keep only the samples whose feature equals value, and strip that
+    # feature column out of each kept sample
+    for vec in dataset:
+        if vec[axis] == value:
+            feature_split = vec[:axis]
+            feature_split.extend(vec[axis + 1:])
+            new_subset.append(feature_split)
+            # extend() merges the elements of vec one by one into
+            # feature_split; append() then adds feature_split to the
+            # result as a whole list
+    return new_subset
+
+def Split_by_entropy(dataset):
+    '''
+    Choose the split feature by the entropy criterion.
+    @information gain: info_gain = ent_old - ent_new
+    @best feature: best_feature
+    @value set of a feature: uniVal
+    '''
+    feature_num = len(dataset[0]) - 1
+    ent_old = cal_entropy(dataset)
+    best_gain = 0.0
+    best_feature = -1
+    # ent_old is the entropy before the split, ent_new the weighted
+    # entropy after it; best_gain is updated while iterating over the
+    # features, so the best feature remains at the end
+    for i in range(feature_num):
+        feature_list = [x[i] for x in dataset]
+        uniVal = set(feature_list)
+        ent_new = 0.0
+        # set() removes duplicates, leaving the distinct feature values
+        for value in uniVal:
+            sub_set = Split_Data(dataset, i, value)
+            prob = len(sub_set) / float(len(dataset))
+            # Weight each subset's entropy by its share of the samples
+            ent_new += prob * cal_entropy(sub_set)
+
+        # ent_old - ent_new picks the feature with the largest gain
+        Info_gain = ent_old - ent_new
+        if Info_gain > best_gain:
+            best_gain = Info_gain
+            best_feature = i
+
+    return best_feature
diff --git a/ch02-Decision-Tree/__init__.py b/ch02-Decision-Tree/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4287ca8617970fa8fc025b75cb319c7032706910
--- /dev/null
+++ b/ch02-Decision-Tree/__init__.py
@@ -0,0 +1 @@
+#
\ No newline at end of file
diff --git a/ch02-Decision-Tree/__main__.py b/ch02-Decision-Tree/__main__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bd2d663fb9c4416aba5be9db1be2a94aed1e981
--- /dev/null
+++ b/ch02-Decision-Tree/__main__.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jul  4 13:23:05 2018
+
+@author: Administrator
+"""
+
+from Cal_Entropy import *
+from Split_by_entropy import *
+from Decision_Tree import *
+from Plot_tree import *
+from Classify_tree import *
+
+def create_data():
+    dataSet = [[1, 1, 'yes'],
+               [1, 1, 'yes'],
+               [1, 0, 'no'],
+               [0, 1, 'no'],
+               [0, 1, 'no']]
+    labels = ['no surfacing', 'flippers']
+    return dataSet, labels
+
+if __name__ == '__main__':
+    myData, labels = create_data()
+    print(myData)
+    print(cal_entropy(myData))
+
+    print(Split_Data(myData, 0, 1))
+    print(Split_by_entropy(myData))
+
+    mytree = Create_Tree(myData, labels)
+    print(mytree)
+
+    myTree = retrieveTree(0)
+    print(Num_of_leaf(myTree), Depth_of_tree(myTree))
+    myTree['no surfacing'][3] = 'maybe'
+    createPlot(myTree)
+
+    with open('lenses.txt') as fp:
+        lenses = [line.strip().split('\t') for line in fp.readlines()]
+    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
+
+    lense_Tree = Create_Tree(lenses, lensesLabels)
+    #createPlot(lense_Tree)
+    print(classify(lense_Tree, lensesLabels, ['young', 'hyper', 'yes', 'reduced']))
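Note the fix in the inner loop: the submitted version accumulated `ent_new += prob * (0 - cal_entropy(sub_set))`, which negates the weighted child entropy and makes the "gain" largest for the *worst* split. To see why `Split_by_entropy` should pick feature 0 ('no surfacing') on the toy dataset, the gains can be worked out by hand. A minimal sketch, assuming the modules above are importable:

```python
# Hand-compute the information gain of each feature and compare with
# Split_by_entropy. Splitting on feature 0 gives subsets {2 yes, 1 no}
# and {2 no}; splitting on feature 1 gives {2 yes, 2 no} and {1 no}.
from Cal_Entropy import cal_entropy
from Split_by_entropy import Split_Data, Split_by_entropy

data = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'],
        [0, 1, 'no'], [0, 1, 'no']]

ent_old = cal_entropy(data)  # ~0.9710
for i in range(2):
    ent_new = 0.0
    for value in set(x[i] for x in data):
        subset = Split_Data(data, i, value)
        ent_new += len(subset) / len(data) * cal_entropy(subset)
    print(i, ent_old - ent_new)  # gain ~0.4200 for i=0, ~0.1710 for i=1

print(Split_by_entropy(data))  # 0 -> 'no surfacing' wins
```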
diff --git a/ch02-Decision-Tree/__pycache__/Cal_Entropy.cpython-36.pyc b/ch02-Decision-Tree/__pycache__/Cal_Entropy.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..13a002d7640a2d8c8fc427622be14d69e3da9d3c
Binary files /dev/null and b/ch02-Decision-Tree/__pycache__/Cal_Entropy.cpython-36.pyc differ
diff --git a/ch02-Decision-Tree/__pycache__/Classify_tree.cpython-36.pyc b/ch02-Decision-Tree/__pycache__/Classify_tree.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7aaf377a6c71718ba381bd232c003364bd2e95b8
Binary files /dev/null and b/ch02-Decision-Tree/__pycache__/Classify_tree.cpython-36.pyc differ
diff --git a/ch02-Decision-Tree/__pycache__/Decision_Tree.cpython-36.pyc b/ch02-Decision-Tree/__pycache__/Decision_Tree.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c4cf8ee354a77711a9b8fb178bf6ab3532b8f452
Binary files /dev/null and b/ch02-Decision-Tree/__pycache__/Decision_Tree.cpython-36.pyc differ
diff --git a/ch02-Decision-Tree/__pycache__/Plot_tree.cpython-36.pyc b/ch02-Decision-Tree/__pycache__/Plot_tree.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..307ea3fd029b183232dde5b18b3b5e816140bcab
Binary files /dev/null and b/ch02-Decision-Tree/__pycache__/Plot_tree.cpython-36.pyc differ
diff --git a/ch02-Decision-Tree/__pycache__/Split_by_entropy.cpython-36.pyc b/ch02-Decision-Tree/__pycache__/Split_by_entropy.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dac0b8417101bb8ee7431fd2d2d2c48a4f57eefe
Binary files /dev/null and b/ch02-Decision-Tree/__pycache__/Split_by_entropy.cpython-36.pyc differ
diff --git a/ch02-Decision-Tree/lenses.txt b/ch02-Decision-Tree/lenses.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d034e37d55450fadad1bd2a9673991ade96c45c2
--- /dev/null
+++ b/ch02-Decision-Tree/lenses.txt
@@ -0,0 +1,24 @@
+young	myope	no	reduced	no lenses
+young	myope	no	normal	soft
+young	myope	yes	reduced	no lenses
+young	myope	yes	normal	hard
+young	hyper	no	reduced	no lenses
+young	hyper	no	normal	soft
+young	hyper	yes	reduced	no lenses
+young	hyper	yes	normal	hard
+pre	myope	no	reduced	no lenses
+pre	myope	no	normal	soft
+pre	myope	yes	reduced	no lenses
+pre	myope	yes	normal	hard
+pre	hyper	no	reduced	no lenses
+pre	hyper	no	normal	soft
+pre	hyper	yes	reduced	no lenses
+pre	hyper	yes	normal	no lenses
+presbyopic	myope	no	reduced	no lenses
+presbyopic	myope	no	normal	no lenses
+presbyopic	myope	yes	reduced	no lenses
+presbyopic	myope	yes	normal	hard
+presbyopic	hyper	no	reduced	no lenses
+presbyopic	hyper	no	normal	soft
+presbyopic	hyper	yes	reduced	no lenses
+presbyopic	hyper	yes	normal	no lenses
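Finally, the pickle helpers in Classify_tree.py let a trained tree be reused without rebuilding it. A minimal usage sketch, assuming the modules above are importable and the working directory is writable (the filename 'classifierStorage.txt' is just an illustrative choice):

```python
# Persist a tree with store_tree and reload it with grab_tree; the
# round-tripped tree classifies exactly like the original.
from Plot_tree import retrieveTree
from Classify_tree import classify, store_tree, grab_tree

tree = retrieveTree(0)
store_tree(tree, 'classifierStorage.txt')
restored = grab_tree('classifierStorage.txt')

labels = ['no surfacing', 'flippers']
print(classify(restored, labels, [1, 0]))  # 'no'
print(classify(restored, labels, [1, 1]))  # 'yes'
```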