# -*- coding: utf-8 -*-
"""
Created on Sat Aug 25 10:39:22 2018
@author: aoanng
"""
from math import log
# Build the sample data set
def createDataSet():
    """Return the sample loan-application data set and its feature names.

    Every row holds four categorical feature values followed by the class
    label in the last column ('同意' = approve, '拒絕' = refuse).  The
    feature columns are, in order: age ('年齡'), has a job ('有工作'),
    owns a house ('有房子') and credit standing ('信貸情況').

    :return: ``(dataSet, featureName)`` -- the list of rows and the list of
        feature-column names.  New list objects are built on every call.
    """
    dataSet = [
        ['青年', '否', '否', '一般', '拒絕'],
        ['青年', '否', '否', '好', '拒絕'],
        ['青年', '是', '否', '好', '同意'],
        ['青年', '是', '是', '一般', '同意'],
        ['青年', '否', '否', '一般', '拒絕'],
        ['中年', '否', '否', '一般', '拒絕'],
        ['中年', '否', '否', '好', '拒絕'],
        ['中年', '是', '是', '好', '同意'],
        ['中年', '否', '是', '非常好', '同意'],
        ['中年', '否', '是', '非常好', '同意'],
        ['老年', '否', '是', '非常好', '同意'],
        ['老年', '否', '是', '好', '同意'],
        ['老年', '是', '否', '好', '同意'],
        ['老年', '是', '否', '非常好', '同意'],
        ['老年', '否', '否', '一般', '拒絕'],
    ]
    featureName = ['年齡', '有工作', '有房子', '信貸情況']
    # Return the rows together with the name of each feature dimension.
    return dataSet, featureName


# Backward-compatible alias: the function was originally defined under this
# spelling, while the call sites in this listing use ``createDataSet``.
createdataSet = createDataSet
# Split the data set
def splitDataSet(dataSet, axis, value):
    """Select the rows whose feature ``axis`` equals ``value``.

    :param dataSet: sequence of rows (each row a list of column values)
    :param axis: index of the feature column to split on
    :param value: feature value a row must have to be kept
    :return: new list of the matching rows, each with column ``axis``
        removed; the input rows are not modified
    """
    # Concatenating the slices before and after ``axis`` drops that column.
    return [
        featVec[:axis] + featVec[axis + 1:]
        for featVec in dataSet
        if featVec[axis] == value
    ]
# Compute the information (Shannon) entropy
def calcShannonEnt(dataSet):
    """Return the Shannon entropy (base 2) of the class labels in ``dataSet``.

    Only the last column of every row -- the class label -- is looked at, so
    the result always measures the uncertainty of the label.

    :param dataSet: sequence of rows whose last element is the class label
    :return: entropy in bits; ``0.0`` for an empty data set
    """
    numEntries = len(dataSet)  # number of instances
    labelCounts = {}
    for featVec in dataSet:  # tally how often each label occurs
        currentLabel = featVec[-1]  # the label is the last column
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)  # log base 2
    return shannonEnt
# Compute the conditional entropy
def calcConditionalEntropy(dataSet, i, featList, uniqueVals):
    """Return the conditional entropy of the label given feature ``i``.

    Computes the sum over every value ``v`` in ``uniqueVals`` of
    ``P(X_i = v) * H(Y | X_i = v)``, where the probability is the fraction
    of rows with that value and the entropy comes from ``calcShannonEnt``
    on the matching subset.

    :param dataSet: the data set (rows ending with the class label)
    :param i: index of the feature column
    :param featList: values of column ``i`` for every row (not used by the
        computation; kept so existing callers keep working)
    :param uniqueVals: the distinct values of column ``i``
    :return: the conditional entropy
    """
    ce = 0.0
    for value in uniqueVals:
        subDataSet = splitDataSet(dataSet, i, value)
        # Maximum-likelihood estimate of P(X_i = value).
        prob = len(subDataSet) / float(len(dataSet))
        # Accumulate the weighted entropy of each subset.
        ce += prob * calcShannonEnt(subDataSet)
    return ce
# Compute the information gain
def calcInformationGain(dataSet, baseEntropy, i):
    """Return the information gain obtained by splitting on feature ``i``.

    :param dataSet: the data set (rows ending with the class label)
    :param baseEntropy: entropy of the class label over ``dataSet``
    :param i: index of the feature column
    :return: ``baseEntropy`` minus the conditional entropy of the label
        given feature ``i``
    """
    column = [row[i] for row in dataSet]  # values of feature i, one per row
    distinct = set(column)  # each value appears once
    # information gain = entropy - conditional entropy
    return baseEntropy - calcConditionalEntropy(dataSet, i, column, distinct)
# Feature selection (ID3 criterion)
def chooseBestFeatureToSplitByID3(dataSet):
    """Pick the feature column with the largest information gain.

    :param dataSet: the data set; the last column of each row is the label
    :return: index of the feature with the highest gain, the earliest one
        on ties, or ``-1`` when no feature has a gain greater than zero
    """
    featureCount = len(dataSet[0]) - 1  # the last column is the class
    datasetEntropy = calcShannonEnt(dataSet)  # entropy of the whole set
    winner, winnerGain = -1, 0.0
    for index in range(featureCount):  # try every feature dimension
        gain = calcInformationGain(dataSet, datasetEntropy, index)
        if gain > winnerGain:
            winner, winnerGain = index, gain
    return winner
def _majorityCnt(classList):
    """Return the class label that occurs most often in ``classList``."""
    from collections import Counter

    return Counter(classList).most_common(1)[0][0]


def createTree(dataSet, featureName, chooseBestFeatureToSplitFunc=chooseBestFeatureToSplitByID3):
    """Recursively build a decision tree.

    :param dataSet: non-empty data set; the last column of each row is the
        class label
    :param featureName: name of every feature column of ``dataSet``; this
        list is not modified
    :param chooseBestFeatureToSplitFunc: callable taking a data set and
        returning the index of the feature to split on (a negative index
        means "no useful feature"); it is reused for every subtree
    :return: a class label when the node is a leaf, otherwise a nested dict
        ``{feature name: {feature value: subtree}}``
    """
    classList = [example[-1] for example in dataSet]  # labels of all rows
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # every row has the same class: stop splitting
    if len(dataSet[0]) == 1:
        # Only the label column is left: fall back to the majority class.
        return _majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplitFunc(dataSet)  # index of best feature
    if bestFeat < 0:
        # No feature improves on the current node, so a negative index must
        # not be used as a column index; return the majority class instead.
        return _majorityCnt(classList)
    bestFeatLabel = featureName[bestFeat]
    myTree = {bestFeatLabel: {}}
    # Build the reduced name list as a copy so the caller's list survives.
    subLabels = featureName[:bestFeat] + featureName[bestFeat + 1:]
    # One branch per distinct value of the chosen feature.
    for value in {example[bestFeat] for example in dataSet}:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value),
            subLabels,
            chooseBestFeatureToSplitFunc,
        )
    return myTree
# Demo: build the decision tree for the sample data and print it.
# Guarded so that importing this module does not run the demo.
if __name__ == "__main__":
    dataSet, featureName = createDataSet()
    myTree = createTree(dataSet, featureName)
    print(myTree)
# ===== File: treePlotter.py (tree visualization) =====
# -*- coding: utf-8 -*-
"""
Created on Sat Aug 25 11:04:40 2018
@author: aoanng
"""
import matplotlib.pyplot as plt
# Text-box and arrow styles used when drawing the tree
decisionNode = {"boxstyle": "round4", "color": "#3366FF"}  # box for decision nodes
leafNode = {"boxstyle": "circle", "color": "#FF6633"}  # box for leaf nodes
arrow_args = {"arrowstyle": "<-", "color": "g"}  # arrow style
# Draw an annotation with an arrow
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw one node box at ``centerPt`` with an arrow linking it to ``parentPt``.

    :param nodeTxt: text shown inside the box
    :param centerPt: ``(x, y)`` of the box, in axes-fraction coordinates
    :param parentPt: ``(x, y)`` of the parent end of the arrow, in
        axes-fraction coordinates
    :param nodeType: box style dict passed as ``bbox``
    """
    annotation_options = dict(
        xy=parentPt,
        xycoords='axes fraction',
        xytext=centerPt,
        textcoords='axes fraction',
        va="center",
        ha="center",
        bbox=nodeType,
        arrowprops=arrow_args,
    )
    # The axes are stored as an attribute of createPlot.
    createPlot.ax1.annotate(nodeTxt, **annotation_options)
# Count the leaf nodes
def getNumLeafs(myTree):
    """Return the number of leaves in a nested-dict decision tree.

    :param myTree: tree of the form ``{label: {value: subtree_or_leaf}}``;
        any child that is not a dict counts as one leaf
    :return: total number of leaves
    """
    firstStr = next(iter(myTree))  # the single root key
    numLeafs = 0
    for child in myTree[firstStr].values():
        if isinstance(child, dict):
            numLeafs += getNumLeafs(child)  # add the leaves of the subtree
        else:
            numLeafs += 1
    return numLeafs
# Compute the depth of the tree
def getTreeDepth(myTree):
    """Return the number of decision levels in a nested-dict decision tree.

    :param myTree: tree of the form ``{label: {value: subtree_or_leaf}}``
    :return: length of the longest root-to-leaf chain of decision nodes; a
        root whose children are all leaves has depth 1
    """
    firstStr = next(iter(myTree))  # the single root key
    maxDepth = 0
    for child in myTree[firstStr].values():
        if isinstance(child, dict):
            thisDepth = 1 + getTreeDepth(child)  # this level plus the subtree
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth
# Put a text label between a parent node and a child node
def plotMidText(cntrPt, parentPt, txtString):
    """Write ``txtString`` at the midpoint between ``cntrPt`` and ``parentPt``.

    :param cntrPt: ``(x, y)`` of the child node
    :param parentPt: ``(x, y)`` of the parent node
    :param txtString: text to draw (rotated by 30 degrees)
    """
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    # The axes are stored as an attribute of createPlot.
    createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30)
def plotTree(myTree, parentPt, nodeTxt):
    """Recursively draw ``myTree`` below ``parentPt``.

    Layout state is kept in function attributes that ``createPlot``
    initialises before the first call: ``plotTree.totalW`` (leaf count of
    the whole tree), ``plotTree.totalD`` (its depth), and the running
    cursor ``plotTree.xOff`` / ``plotTree.yOff``.

    :param myTree: nested-dict tree ``{label: {value: subtree_or_leaf}}``
    :param parentPt: ``(x, y)`` of the parent node
    :param nodeTxt: text for the edge from the parent to this node
    """
    numLeafs = getNumLeafs(myTree)
    firstStr = list(myTree.keys())[0]
    # Centre this node horizontally above the leaves of its own subtree.
    cntrPt = (
        plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW,
        plotTree.yOff,
    )
    plotMidText(cntrPt, parentPt, nodeTxt)  # label on the edge to the parent
    plotNode(firstStr, cntrPt, parentPt, decisionNode)  # the decision node
    secondDict = myTree[firstStr]
    # Step down one level before drawing the children.
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            # Each leaf takes the next horizontal slot.
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    # Step back up so siblings of this subtree are drawn at the right level.
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD
def createPlot(inTree):
    """Draw the decision tree ``inTree`` in figure 1 and show it.

    Creates tick-less, frame-less axes, stores them as ``createPlot.ax1``,
    initialises the layout attributes read by ``plotTree`` and then plots
    the tree starting from the top centre of the axes.

    :param inTree: nested-dict tree ``{label: {value: subtree_or_leaf}}``
    """
    figure = plt.figure(1, facecolor='white')
    figure.clf()
    # No ticks and no frame: only the tree itself is drawn.
    createPlot.ax1 = plt.subplot(111, frameon=False, xticks=[], yticks=[])
    leafTotal = float(getNumLeafs(inTree))
    plotTree.totalW = leafTotal
    plotTree.totalD = float(getTreeDepth(inTree))
    # Start half a leaf slot left of the axes; each leaf then advances the
    # cursor by one full slot.
    plotTree.xOff = -0.5 / leafTotal
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()
# ===== File: main.py (complete driver script) =====
# -*- coding: utf-8 -*-
"""
Created on Sat Aug 25 10:00:16 2018
@author: aoanng
"""
from pylab import *
import treePlotter
from ID3Tree import *
mpl.rcParams.update({
    'font.sans-serif': ['SimHei'],  # default font for the plot text
    'axes.unicode_minus': False,  # avoid the minus sign showing as a box in saved figures
})

# ---- Build the decision tree ----
myDat, labels = createDataSet()
myTree = createTree(myDat, labels)

# ---- Draw the decision tree ----
treePlotter.createPlot(myTree)
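# (Web-page footer captured with the listing; not part of the program.)
# For more news, follow "tft每日頭條"; we will keep posting the latest updates.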