
Python Machine Learning: The Decision Tree Algorithm


The ID3 implementation, ID3Tree.py:

# -*- coding: utf-8 -*-
"""
Created on Sat Aug 25 10:39:22 2018

@author: aoanng
"""

from math import log


## Create the dataset
def createDataSet():
    """
    Create the loan-application dataset.
    Each row is [age, has-a-job, owns-a-house, credit-rating, decision], with the
    original Chinese values kept: 青年/中年/老年 = youth/middle-aged/elderly,
    是/否 = yes/no, 一般/好/非常好 = fair/good/excellent, 同意/拒絕 = approve/reject.
    """
    dataSet = [['青年', '否', '否', '一般', '拒絕'],
               ['青年', '否', '否', '好', '拒絕'],
               ['青年', '是', '否', '好', '同意'],
               ['青年', '是', '是', '一般', '同意'],
               ['青年', '否', '否', '一般', '拒絕'],
               ['中年', '否', '否', '一般', '拒絕'],
               ['中年', '否', '否', '好', '拒絕'],
               ['中年', '是', '是', '好', '同意'],
               ['中年', '否', '是', '非常好', '同意'],
               ['中年', '否', '是', '非常好', '同意'],
               ['老年', '否', '是', '非常好', '同意'],
               ['老年', '否', '是', '好', '同意'],
               ['老年', '是', '否', '好', '同意'],
               ['老年', '是', '否', '非常好', '同意'],
               ['老年', '否', '否', '一般', '拒絕']]
    featureName = ['年齡', '有工作', '有房子', '信貸情況']
    # Return the dataset and the name of each feature dimension
    return dataSet, featureName

## Split the dataset
def splitDataSet(dataSet, axis, value):
    """
    Partition the dataset on a given feature.
    :param axis: index of the feature to split on
    :param value: value of that feature to select
    :return: all instances whose feature `axis` equals `value`,
             with that feature column removed
    """
    retDataSet = []
    for featVec in dataSet:  # loop over every row of dataSet
        if featVec[axis] == value:
            reduceFeatVec = featVec[:axis]  # drop this feature dimension...
            reduceFeatVec.extend(featVec[axis + 1:])  # ...by splicing around it
            retDataSet.append(reduceFeatVec)
    return retDataSet
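
# Example: splitDataSet(dataSet, 0, '青年') keeps the five youth rows and strips
# the age column, returning
# [['否', '否', '一般', '拒絕'], ['否', '否', '好', '拒絕'], ['是', '否', '好', '同意'],
#  ['是', '是', '一般', '同意'], ['否', '否', '一般', '拒絕']]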

## Compute information entropy
# What is measured is always the uncertainty of the class label.
def calcShannonEnt(dataSet):
    """
    Compute the Shannon entropy of the class variable Y in the training set.
    """
    numEntries = len(dataSet)  # number of instances
    labelCounts = {}
    for featVec in dataSet:  # count the frequency of each label
        currentLabel = featVec[-1]  # the label is the last column
        # If the label has not been seen yet, add it to labelCounts
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)  # log base 2
    return shannonEnt
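
# The loop above implements H(Y) = -sum_k p_k * log2(p_k), with p_k the relative
# frequency of label k. The loan data has 9 '同意' and 6 '拒絕' rows out of 15, so
#   calcShannonEnt(dataSet) = -(9/15)*log2(9/15) - (6/15)*log2(6/15) ≈ 0.971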

## Compute conditional entropy
def calcConditionalEntropy(dataSet, i, featList, uniqueVals):
    """
    Compute the conditional entropy of Y given feature X_i.
    :param dataSet: the dataset
    :param i: feature dimension i
    :param featList: list of values of feature i (unused here)
    :param uniqueVals: set of distinct values of feature i
    :return: the conditional entropy H(Y|X_i)
    """
    ce = 0.0
    for value in uniqueVals:
        subDataSet = splitDataSet(dataSet, i, value)
        prob = len(subDataSet) / float(len(dataSet))  # maximum-likelihood estimate of P(X_i = value)
        ce += prob * calcShannonEnt(subDataSet)  # H(Y|X_i) = Σ P(X_i = x) * H(Y|X_i = x)
    return ce

## Compute information gain
def calcInformationGain(dataSet, baseEntropy, i):
    """
    Compute the information gain of feature i.
    :param dataSet: the dataset
    :param baseEntropy: the entropy of Y over the whole dataset
    :param i: feature dimension i
    :return: the information gain g(dataSet, X_i)
    """
    featList = [example[i] for example in dataSet]  # values of the i-th feature
    uniqueVals = set(featList)  # deduplicate into a set
    newEntropy = calcConditionalEntropy(dataSet, i, featList, uniqueVals)  # conditional entropy
    infoGain = baseEntropy - newEntropy  # information gain = entropy - conditional entropy
    return infoGain
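
# Worked example for the owns-a-house feature (i = 2): its '是' branch holds
# 6 rows, all '同意' (entropy 0); its '否' branch holds 9 rows with 3 '同意' and
# 6 '拒絕' (entropy ≈ 0.918). Hence
#   calcInformationGain(dataSet, 0.971, 2) = 0.971 - (9/15)*0.918 ≈ 0.420,
# the largest gain of the four features, so ID3 splits on it first.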

## Algorithm skeleton
def chooseBestFeatureToSplitByID3(dataSet):
    """
    Choose the best split feature (ID3: maximum information gain).
    """
    numFeatures = len(dataSet[0]) - 1  # the last column is the class label
    baseEntropy = calcShannonEnt(dataSet)  # entropy of the whole dataset
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):  # iterate over every feature dimension
        infoGain = calcInformationGain(dataSet, baseEntropy, i)  # gain of feature i
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature  # index of the best feature
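
# createTree below falls back on majorityCnt, which the original listing never
# defines; a minimal majority-vote version (assumed here) is:
def majorityCnt(classList):
    """Return the most frequent class label in classList."""
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    return max(classCount, key=classCount.get)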

def createTree(dataSet, featureName, chooseBestFeatureToSplitFunc=chooseBestFeatureToSplitByID3):
    """
    Build the decision tree recursively.
    :param dataSet: the dataset
    :param featureName: the name of each feature dimension
    :return: the decision tree, as nested dicts
    """
    classList = [example[-1] for example in dataSet]  # list of class labels
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # stop splitting once all labels are identical
    if len(dataSet[0]) == 1:  # no features left: return the majority class
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplitFunc(dataSet)  # index of the best feature
    bestFeatLabel = featureName[bestFeat]  # name of the best feature
    myTree = {bestFeatLabel: {}}  # nested dict keyed by the feature name
    del featureName[bestFeat]
    # Recurse on the subset for every value of the chosen feature
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = featureName[:]  # copy, so recursion does not clobber the list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


# Test building the tree
dataSet, featureName = createDataSet()
myTree = createTree(dataSet, featureName)
print(myTree)
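
With the gains worked out above, 有房子 (owns a house) wins the first split, and in its '否' branch 有工作 (has a job) separates the remaining rows perfectly, so the script should print a nested dict equivalent to (key order may vary):

{'有房子': {'否': {'有工作': {'否': '拒絕', '是': '同意'}}, '是': '同意'}}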

The visualization module, treePlotter.py:

# -*- coding: utf-8 -*-
"""
Created on Sat Aug 25 11:04:40 2018

@author: aoanng
"""

import matplotlib.pyplot as plt

# Text-box and arrow styles
decisionNode = dict(boxstyle="round4", color='#3366FF')  # style of decision (internal) nodes
leafNode = dict(boxstyle="circle", color='#FF6633')  # style of leaf nodes
arrow_args = dict(arrowstyle="<-", color='g')  # arrow style


# Draw a node with an arrow from its parent
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)

# Count the leaf nodes
def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':  # a subtree: recurse
            numLeafs += getNumLeafs(secondDict[key])
        else:  # a leaf
            numLeafs += 1
    return numLeafs

# Compute the depth of the tree
def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':  # a subtree: recurse
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:  # a leaf
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth
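
# For the tree built in ID3Tree.py, getNumLeafs returns 3 (leaves '同意', '同意',
# '拒絕') and getTreeDepth returns 2 (two decision levels: 有房子, then 有工作).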

# Write text midway between a parent and a child node
def plotMidText(cntrPt, parentPt, txtString):
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30)

def plotTree(myTree, parentPt, nodeTxt):
    numLeafs = getNumLeafs(myTree)  # width of this subtree, in leaves
    depth = getTreeDepth(myTree)
    firstStr = list(myTree.keys())[0]
    # Center this node above its leaves, in axes-fraction coordinates
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)  # label the edge from the parent
    plotNode(firstStr, cntrPt, parentPt, decisionNode)  # draw the decision node
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD  # descend one level
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            plotTree(secondDict[key], cntrPt, str(key))  # recurse into the subtree
        else:
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW  # next leaf slot
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD  # climb back up

def createPlot(inTree):
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))  # total width, in leaves
    plotTree.totalD = float(getTreeDepth(inTree))  # total depth, in levels
    plotTree.xOff = -0.5 / plotTree.totalW  # start half a slot left of the first leaf
    plotTree.yOff = 1.0  # the root sits at the top of the axes
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()

The complete driver, main.py:

# -*- coding: utf-8 -*-
"""
Created on Sat Aug 25 10:00:16 2018

@author: aoanng
"""

import matplotlib as mpl
import treePlotter
from ID3Tree import *

mpl.rcParams['font.sans-serif'] = ['SimHei']  # default font that can render the Chinese labels
mpl.rcParams['axes.unicode_minus'] = False  # keep '-' from rendering as a box in saved figures

##################################
# Build the decision tree
myDat, labels = createDataSet()
myTree = createTree(myDat, labels)
# Plot the decision tree
treePlotter.createPlot(myTree)

Running main.py pops up the matplotlib rendering of this tree.
