# E) d4 c1 O% f% Fimport numpy as np& a9 d9 Q# f5 X" U& i7 [' L5 I( C
from matplotlib import pyplot as plt$ e, y% e9 D }1 R) j3 b3 G
from math import log 4 V+ T! {+ ^" W5 O" n% D# q4 h1 zimport pandas as pd' G! i8 }8 N' T4 \ b( ?
import pydotplus as pdp / e0 H8 ^/ C+ b7 N* x4 b ! d M! X9 {# I k( p! { g7 [9 N"""1 M+ M2 B8 m# w. M
19335286 郑有为 , l! N% w) J4 i7 L6 v2 B: M人工智能作业 - 实现ID3决策树 . ^: Z! S- j4 i: P" F3 z, R- o/ t""" 6 S- x( d* K. [9 [0 j" P2 U: q7 g3 @
nonce = 0 # 用来给节点一个全局ID" w% j+ ?' F* N/ F
color_i = 0 ) t, m+ ~% h: G) l# 绘图时节点可选的颜色, 非叶子节点是蓝色的, 叶子节点根据分类被赋予不同的颜色2 A+ C L1 `3 s5 q. O
color_set = ["#AAFFDD", "#DDAAFF", "#DDFFAA", "#FFAADD", "#FFDDAA"]8 `, ]4 D4 k( W, A3 o
4 A, W0 j+ }: q
# 载入汽车数据, 判断顾客要不要买 - S6 z. s/ q4 j0 ]2 I' B) Vclass load_car: g* h+ E# b6 z # 在表格中,最后一列是分类结果 0 M+ U5 ]) V+ ]: z # feature_names: 属性名列表 7 b2 w4 j1 N* j # target_names: 标签(分类)名 + m/ ~6 d; i/ d( P! {. }, J # data: 属性数据矩阵, 每行是一个数据, 每个数据是每个属性的对应值的列表9 s# Q: Y( T5 _* Z
# target: 目标分类值列表 ( H: N) u" z+ Q$ G" F) [+ l def __init__(self):, s. n9 e8 l9 J% J
df = pd.read_csv('../dataset/car/car_train.csv')% T; t) G' K9 C! e3 E
labels = df.columns.values & K0 T4 _! {+ l8 B data_array = np.array(df[1:]) * B8 J9 Z# m. g9 I8 @ self.feature_names = labels[0:-1] : D+ h4 ^! W3 E) k, ~5 B4 X+ n self.target_names = labels[-1]5 V( F6 F) q: k
self.data = data_array[0:,0:-1] / W* G5 s( X! [' X/ ~# }& T self.target = data_array[0:,-1]! q; k, x4 x. e2 }; h" f- W& R
9 |) O* y+ V# z
# 载入蘑菇数据, 鉴别蘑菇是否有毒 " g. v' m! a" f2 K7 Jclass load_mushroom: 0 Q4 }0 X; Y* b; x' v+ S0 g # 在表格中, 第一列是分类结果: e 可食用; p 有毒.1 \4 s+ |6 J0 Q' S) X* e/ }
# feature_names: 属性名列表 1 p8 t$ m8 u8 P1 U # target_names: 标签(分类)名) A; |6 z" B. A! Z2 A$ E
# data: 属性数据矩阵, 每行是一个数据, 每个数据是每个属性的对应值的列表& \! j r) ? C. ^5 W9 t$ c
# target: 目标分类值列表 4 q' s: F) V( l' _ def __init__(self): ) o$ k5 ~0 S9 |9 F! K# d& r8 {5 n df = pd.read_csv('../dataset/mushroom/agaricus-lepiota.data')& N% ~1 Q+ Z" M; _. g m* W3 _
data_array = np.array(df) $ t2 O0 y) c, S9 T2 ~ labels = ["edible/poisonous", "cap-shape", "cap-surface", "cap-color", "bruises", "odor", "gill-attachment",4 M( n5 z: s1 M" w; h2 n
"gill-spacing", "gill-size", "gill-color", "stalk-shape", "stalk-root", "stalk-surface-above-ring", - q3 v1 Y- d! k1 k C9 H$ D5 P "stalk-surface-below-ring", "stalk-color-above-ring", "stalk-color-below-ring"," m6 ^8 u' b% \- G
"veil-type", "veil-color", "ring-number", "ring-type", "spore-print-color", "population", "habitat"]6 o2 ^5 Y# |1 [* T) W6 L& o
self.feature_names = labels[1:]0 ^( ~; k% b3 c+ @
self.target_names = labels[0]' u1 U3 j0 k# t3 \( [1 S" T
self.data = data_array[0:,1:] 6 E' ~1 x& I; ]: G9 R3 S self.target = data_array[0:,0] 8 O# L( b' p3 ?& @/ r) z {* `$ |* t6 z) u3 ]8 { d3 l
# 创建一个临时的子数据集, 在划分测试集和训练集时使用 ' x: w5 n) i% Z* o( I9 n* Z$ m7 L! g/ F! Xclass new_dataset:) F& N# R+ C: b; W& _ m% P
# feature_names: 属性名列表3 [- a9 t0 v( ~9 G4 b
# target_names: 标签(分类)名" t0 I$ ?1 W r# c7 a
# data: 属性数据矩阵, 每行是一个数据, 每个数据是每个属性的对应值的列表 3 v: \$ Q1 |$ m+ g2 j # target: 目标分类值列表1 X; D( y( p, O: P* ^
def __init__(self, f_n, t_n, d, t):+ U) C7 s" x9 T* i) O% Z
self.feature_names = f_n & s# C& c( Y' X7 s2 P2 C self.target_names = t_n " s" ?% L' R* e self.data = d0 b1 P+ [. t3 P2 D% J& b) [
self.target = t % \' I- b; I8 N5 u& s 2 h7 ^$ U" s3 J( K% p) p7 o0 `) t1 x# 计算熵, 熵的数学公式为: $H(V) = - \sum_{k} P(v_k) \log_2 P(v_k)$$ l, Y) n$ c! i( O7 I
# 其中 P(v_k) 是随机变量 V 具有值 V_k 的概率 ' j" [& F3 Y# \- ^: l# R# target: 分类结果的列表, return: 信息熵 ) p3 ?' ^" ?0 ~4 Z ]" W4 x& u/ Odef get_h(target):6 u: v6 L1 }) `$ u3 q, L5 r0 M; C
target_count = {}! Z% \( H" t/ u9 l% ~
for i in range(len(target)):; X' H" e3 }/ G: \8 |
label = target. ^; f5 }. v7 ~- M5 r
if label not in target_count.keys():+ I% }9 ~4 i6 {! R
target_count[label] = 1.0 ) J9 ?) x; c j3 e else: 7 }5 q6 X: [- Q# P6 r target_count[label] += 1.0 4 m- r* P6 f& F2 J8 S) y' g h = 0.07 d Z7 ?# P& S- i8 \8 u
for k in target_count: 9 m' G: K: Q, F J5 n p = target_count[k] / len(target)5 D7 O h/ U! ]
h -= p * log(p, 2) % ]& d) r! t; |( Q% \) C return h' E$ Y* \; S/ h% K2 G' ]/ N1 b* q
: F `5 ]# ^: |# e# 取数据子集, 选择条件是原数据集中的属性 feature_name 值是否等于 feature_value - @8 ?- K/ H8 s" S3 B+ J( [4 m5 X# 注: 选择后会从数据子集中删去 feature_name 属性对应的一列 $ `- }& L9 x+ }: edef get_subset(dataset, feature_name, feature_value):( H) _. p% ^ u
sub_data = []9 D p. x6 z& _3 b
sub_target = []' w5 H( B1 p! y/ r
f_index = -1. p- s3 {2 o8 o
for i in range(len(dataset.feature_names)): 4 \ i8 ^- f, N( C t+ z) o- C if dataset.feature_names == feature_name:0 k; x$ F4 M; j" `; [8 V5 c9 O
f_index = i # Y* M- r: f" ~ break ' a( F* K7 L; B2 |2 ?. ?% X. H7 c4 L- ^* s
for i in range(len(dataset.data)):; `4 v0 v& y% H7 b. g/ o5 a4 p
if dataset.data[f_index] == feature_value: , W+ C5 m }" N3 a l = list(dataset.data[:f_index])& h' W5 j4 h" H% E
l.extend(dataset.data[f_index+1:])5 e0 S; y* y o6 m
sub_data.append(l) 9 q1 y1 `& z+ }+ A6 Y _: s0 |3 _ sub_target.append(dataset.target)/ s7 y/ q" O3 M) E8 F( `: G
' Z9 n0 ^* R+ y$ d) y
sub_feature_names = list(dataset.feature_names[:f_index])1 e/ C# \ R! D
sub_feature_names.extend(dataset.feature_names[f_index+1:]) ( `- z9 J5 {/ `' m3 y/ f% y3 P0 x1 w return new_dataset(sub_feature_names, dataset.target_names, sub_data, sub_target) # S( U( K* {* R6 U7 c/ ~) ]( B3 m' l6 _ + h H6 D& s- R5 M# m# 寻找并返回信息收益最大的属性划分, H8 l: @8 R$ D% J7 X
# 信息收益值划分该数据集前后的熵减5 h" A& I* i7 |9 x) \0 C* ~
# 计算公式为: Gain(A) = get_h(ori_target) - sum(|sub_target| / |ori_target| * get_h(sub_target))$, i- V4 X% ~0 h
def best_spilt(dataset): # |' C6 A) Z! ~ 5 D3 o5 b6 P# Q base_h = get_h(dataset.target) / b8 T: t8 p; P8 B/ m3 X& c best_gain = 0.0 0 b, _6 `6 y, V7 P. Z best_feature = None0 v5 V7 S6 r$ _: n7 S5 _
for i in range(len(dataset.feature_names)):/ ~8 I3 ?/ d3 J
feature_range = [] ' o: v0 |( w9 v7 X9 \7 q# B for j in range(len(dataset.data)): 1 K+ s: [9 k9 O4 }$ r' L if dataset.data[j] not in feature_range:& w0 y) M2 E/ S! b
feature_range.append(dataset.data[j]) 6 X2 y- }: q5 X" t0 S0 F/ }4 a8 M1 r3 A' n7 z
spilt_h = 0.0, P3 r) Z2 G+ G
for feature_value in feature_range: + ~* [6 [- B! s7 q8 k; T8 _ subset = get_subset(dataset, dataset.feature_names, feature_value) + w$ L/ M6 }8 P spilt_h += len(subset.target) / len(dataset.target) * get_h(subset.target) + V' H2 C) f* R2 V3 n1 I6 _8 Y4 E. {: _# w0 s& E, ]
if best_gain <= base_h - spilt_h:0 k" ]" A- h8 d; Y
best_gain = base_h - spilt_h! c3 U/ p5 x, _% ~7 t
best_feature = dataset.feature_names( f ?" \1 L: Z# S. w- t; x
+ F; n q6 K! x6 V4 o5 { return best_feature 3 x: Y2 u4 C" `) }0 p: F. P, ? 7 ]% y/ {9 w' V) V; O" N0 a# 返回数据集中一个数据最可能的标签5 i% R @% c! |$ H6 U
def vote_most(dataset): * w D& B% Q4 t5 b3 m* S target_range = {}3 R; h& G6 u4 ~; j: x {9 t) x
best_target = None0 x$ A, M. |$ U* E* ~# S. A
best_vote = 0/ a( Y% |, _/ Z2 C
4 I3 P( P, v. y
for t in dataset.target: 5 N0 I7 N9 w W- D if t not in target_range.keys():* H$ n9 x" a( O7 T4 J2 a o( P
target_range[t] = 1 " C9 S+ x+ l! _2 v" p9 O else: / D; p+ e& s( A target_range[t] += 1 ) }, w& e2 k+ y. `5 h+ y# \/ Q* t) G y$ s# t N
for t in target_range.keys(): ' n! R. F1 P+ v" H if target_range[t] > best_vote:7 n1 Z. O0 C' \' l) a- I
best_vote = target_range[t] : a) \* i: w& v& V) o% @. [7 ]. [ best_target = t' Z+ u: U# ~- U+ Z2 H
7 _7 w1 g# l) \5 P8 e- } return best_target ' Z3 e* P x U! z& c2 P+ ?! ]# h& x* S1 i* }
# 返回测试的正确率 0 H( ~% ?) Y5 J. k# predict_result: 预测标签列表, target_result: 实际标签列表: x2 _6 x, _4 f8 M+ }3 C
def accuracy_rate(predict_result, target_result):+ ~/ K( G8 O! F! p; `0 q3 I# @. j
# print("Predict Result: ", predict_result) ' T2 L% Z' S, ^5 I' m& a {, c # print("Target Result: ", target_result) # G6 ?' t( A+ I" k$ ]( X9 P8 } accuracy_score = 0) l) b5 Q: ?. y5 \ s; C
for i in range(len(predict_result)):# K1 O" K3 _& S3 }: y! {" E) b
if predict_result == target_result: ; @9 l9 k' G4 e3 H& t accuracy_score += 16 B6 [) c: e+ z4 X2 j5 B
return accuracy_score / len(predict_result) : y% L; a9 f9 Z# `7 b% q1 w# c" @
# 决策树的节点结构' W; t ?9 m# q- B! _ Z
class dt_node: & X& u1 }/ F" i X/ \2 p' C# T" Q' f0 a8 ^; m- y
def __init__(self, content, is_leaf=False, parent=None):2 E) k! `7 X# l& U$ n
global nonce6 U8 |; q3 B6 b
self.id = nonce # 为节点赋予一个全局ID, 目的是方便画图 / h' t. d. O8 X# A nonce += 18 }0 u0 U/ \7 {! U' s
self.feature_name = None 6 B* X) g& c+ }% E" K! j self.target_value = None* P% S& s' p. x0 f
self.vote_most = None # 记录当前节点最可能的标签 2 c8 d, z' V- [7 E3 V* A" C. j if not is_leaf:; _: D. s+ n+ i6 t' T
self.feature_name = content # 非叶子节点的属性名 3 i: M9 i' _ Q% e# T else: 7 J# ^( \3 ~: g# a! q3 e self.target_value = content # 叶子节点的标签 , X3 \% Q! d% d( c& m" N0 I0 s 9 g) u# L$ y. \+ T5 P& D3 R1 |6 R self.parent = parent, J2 a4 S& i1 d. o" g/ L
self.child = {} # 以当前节点的属性对应的属性值作为键值 ( X+ W, Z( v- e; L/ m. z# T; p; c$ ~: @+ i. R+ n& q; A
# 决策树模型 6 j6 I( z3 |9 M0 o' z |. Qclass dt_tree:4 u2 l) b) w' X. V1 z$ e
9 i' \) I+ f, r! s def __init__(self): 0 C6 x% I0 f; O6 V7 s self.tree = None # 决策树的根节点 1 I, ]# \9 q' p( R% o self.map_str = """ # F# D* N3 e f1 k digraph demo{+ ]. e \$ f. x( j9 Q9 t
node [shape=box, style="rounded", color="black", fontname="Microsoft YaHei"]; . I% V* M/ R; x/ a8 ?$ J edge [fontname="Microsoft YaHei"]; : o$ z1 Y- F' P2 W """ # 用于作图: pydotplus 格式的树图生成代码结构/ N* E0 d: u( X1 N' _( U0 a9 L$ l% J, i
self.color_dir = {} # 用于作图: 叶子节点可选颜色, 以标签值为键值 5 E3 c: v6 ]0 N2 c. v " y& {8 D( C( D+ Q: F # 训练模型, train_set: 训练集" L/ i( Z6 |1 ^3 m) }
def fit(self, train_set): ; X& W+ p5 M" o7 U; V* I; Q* g, d4 q6 M4 `) H* M# W7 F4 o6 R
if len(train_set.target) <= 0: # 如果测试集数据为空, 则返回空节点, 结束递归 - A: ]7 |9 M& @2 o. M8 M return None * H) F3 B( G# s7 R$ l+ ]- i! K4 s$ k/ [. `" @" I: |
target_all_same = True3 w0 {+ J3 W5 a! y& v' s l
for i in train_set.target:) X& W# F6 j3 E6 V9 y6 d% o
if i != train_set.target[0]: , Y2 g9 {/ Q, n$ ^ target_all_same = False$ ]9 A, I& l3 Z+ l! [
break1 ^4 v# c8 Y+ z1 A
0 c/ R J+ D! o
if target_all_same: # 如果测试集数据中所有数据的标签相同, 则构造叶子节点, 结束递归: q% h5 `1 Y9 [, @, d5 @
node = dt_node(train_set.target[0], is_leaf=True) , ~8 U) h O9 u. X% j* ^7 c; z if self.tree == None: # 如果根节点为空,则让该节点成为根节点3 u' \ R4 {1 y. m
self.tree = node 1 v( H& @$ f; J# Q5 c$ a. g0 w4 r/ Q- h
# 用于作图, 更新 map_str 内容, 为树图增加一个内容为标签值的叶子节点4 Y5 j! F( H( V
node_content = "标签:" + str(node.target_value) - ^1 y! _% b- R# ^+ f; v. h self.map_str += "id" + str(node.id) + "[label=\"" + node_content + "\", fillcolor=\"" + self.color_dir[node.target_value] + "\", style=filled]\n"/ H3 c1 f3 q) y, o) b
5 ^/ o* E% l; {; n return node+ K5 {4 s1 X" x/ a/ I5 V: W
elif len(train_set.feature_names) == 0: # 如果测试集待考虑属性为空, 则构造叶子节点, 结束递归 0 C2 e6 t! ^& g. m5 p node = dt_node(vote_most(train_set), is_leaf=True) # 这里让叶子结点的标签为概率上最可能的标签6 F0 j( e4 o6 @1 m7 \8 i+ y
if self.tree == None: # 如果根节点为空,则让该节点成为根节点 8 c; O5 X7 v5 x. `1 A self.color_dir[vote_most(train_set)] = color_set[0] # ^$ I4 G. q" l$ B' N: `& _ self.tree = node9 D- i! o0 }$ _% W2 \
/ [, T8 Q/ L( @" @# O
# 用于作图, 更新 map_str 内容, 为树图增加一个内容为标签值的叶子节点% k, k, K3 e' g v9 d7 N* R) W
node_content = "标签:" + str(node.target_value)7 e* M! _. v, y4 z+ s" M
self.map_str += "id" + str(node.id) + "[label=\"" + node_content + "\", fillcolor=\"" + self.color_dir[node.target_value] + "\", style=filled]\n" 9 E- A+ {, L- T4 g8 ?7 D- i, q& \3 R, e# J6 ^- ^6 A4 @7 T" K* W
return node; |. C$ A- V2 Z# g
else: # 普通情况, 构建一个内容为属性的非叶子节点$ w8 O9 X3 x6 h' f/ ~# h
best_feature = best_spilt(train_set) # 寻找最优划分属性, 作为该结点的值# y4 G0 s- ~4 A2 g' v
best_feature_index = -1 " U7 f+ q# s( W; ]* y) W5 _ for i in range(len(train_set.feature_names)):) p' _1 T. h! [1 F: v
if train_set.feature_names == best_feature: ( j, G# F7 \- y' m) h1 {" I best_feature_index = i; y U% |( F4 v. o9 g( K' h
break $ P+ ?4 w! q& D# i# B$ I6 p* S n: d, c, v. W
node = dt_node(best_feature) 5 c" d: S+ C: X7 N, E. P/ |$ w node.vote_most = vote_most(train_set)( |. p: e1 e; {
if self.tree == None: # 如果根节点为空,则让该节点成为根节点+ z( X9 i1 E. E% j+ p; D9 i( R$ q
self.tree = node" f- N& J6 c: O+ F6 j- x
# 用于作图, 初始化叶子节点可选颜色% X! F8 A( E4 u2 g1 H( `9 _+ e4 Z
for i in range(len(train_set.target)): - U; r3 Y' N. s* T2 F' k9 r if train_set.target not in self.color_dir:8 B$ |! n, J7 i, e: `5 t
global color_i * f+ y7 D7 w8 P( c2 q; H self.color_dir[train_set.target] = color_set[color_i] $ L5 h" ]2 A% \! c color_i += 19 c2 ^, H6 \3 g$ T: n% X" ~
color_i %= len(color_set)$ m2 M4 R h- b# J" i, e
- w" |( D0 J3 u; D5 k1 P9 j5 c6 B
feature_range = [] # 获取该属性出现在数据集中的可选属性值 % B& t7 o, I2 b( k for t in train_set.data:6 e: a9 B# ?5 @ G5 c# g
if t[best_feature_index] not in feature_range:. e/ ]4 k G! O* P) U
feature_range.append(t[best_feature_index]) $ U2 H* D) K$ z4 B3 p# X' v: i$ N! O9 |3 z
# 用于做图, 创建一个内容为属性的非叶子节点6 K2 }" l9 X2 x; Z# I
node_content = "属性:" + node.feature_name! f( {( d/ N2 G$ o* l" S, T
self.map_str += "id" + str(node.id) + "[label=\"" + node_content + "\", fillcolor=\"#AADDFF\", style=filled]\n", e; W* I( D$ D Y2 `
0 C; \& }6 h0 y) o. L. I: t P' @
for feature_value in feature_range:; ~. q$ C6 k1 y: V$ r2 m
subset = get_subset(train_set, best_feature, feature_value) # 获取每一个子集 W4 A' C1 [' p. C8 _9 i node.child[feature_value] = self.fit(subset) # 递归调用 fit 函数生成子节点 q: }9 T( W2 N* s& o: D: K
if node.child[feature_value] == None: 9 l: v2 p' G' `5 a # 如果创建的子节点为空, 则创建一个叶子节点作为其子节点, 其中标签值为概率上最可能的标签; x1 f0 }' _% n; j
node.child[feature_value] = dt_node(vote_most(train_set), is_leaf=True) ) F( D6 L( X/ U+ a$ }9 Y: q node.child[feature_value].parent = node" s$ q/ r( f$ ]* T
" E: u4 [ Y r2 L' g # 用于做图, 创建当前节点到所有子节点的连线7 Q' E( \+ p% y0 S9 u
self.map_str += "id" + str(node.id) + " -> " + "id" + str(node.child[feature_value].id) + "[label=\"" + str(feature_value) + "\"]\n", ?! h! s- N+ B, X4 ]' n; V
; B P$ [' K0 V" k
# print("Rest Festure: ", train_set.feature_names) 6 q% {4 q/ C t6 }: Z- p # print("Best Feature: ", best_feature_index, best_feature, "Feature Range: ", feature_range) ) y# C3 R2 g- r- j # for feature_value in feature_range: 5 `# {+ W! T$ |5 W # print("Child[", feature_value, "]: ", node.child[feature_value].feature_name, node.child[feature_value].target_value)1 P7 J8 e d' f5 g& R
return node ! D# P; ` v* y7 X, e" v + P: f k. Z) u # 测试模型, 对测试集 test_set 进行预测9 e/ O+ s; j J6 O
def predict(self, test_set):" r; P/ @4 l R9 i! h0 {5 n
test_result = [] C1 K! ^. b. h6 Q for test in test_set.data: 3 w5 p1 p$ R; ~% d- {: m node = self.tree # 从根节点一只往下找, 知道到达叶子节点 ) L3 `. Q! x q1 `2 h9 g, q while node.target_value == None: & J, J! v! p0 ]2 b7 @: A! X feature_name_index = -1' P6 ^, h [' d; t# V+ [
for i in range(len(test_set.feature_names)): 9 ^. D. Y5 ^: C, _7 L4 n2 a+ U if test_set.feature_names == node.feature_name:! O% Q% M' [6 E9 S! N6 E
feature_name_index = i 7 c4 J0 L1 y' R) R1 U break t# c; q/ I3 F: }, \7 ` if test[feature_name_index] not in node.child.keys():) X: r7 ~0 ~5 F' }. C
break( F, J* m: m" K* ^8 o
else: q# K: ~4 A P! J; G2 H node = node.child[test[feature_name_index]] : M% z5 j$ u. r, L4 H2 E% o : ?7 [+ T" J$ v5 }* ? if node.target_value == None:0 ?( _, h6 g2 K) Y( h, ?- N
test_result.append(node.vote_most)7 }! u! c0 }! F# |0 Q( x
else: # 如果没有到达叶子节点, 则取最后到达节点概率上最可能的标签为目标值 & I6 g$ l/ S0 n9 \- T test_result.append(node.target_value) # k$ C; o6 z8 f% t9 k2 x 1 U" F" {/ B( J7 `2 B' c4 I return test_result) o+ U" v( i* y! G: ^& ` |
" H$ w- P. I* D- H% p9 q: B' L% c # 输出树, 生成图片, path: 图片的位置5 E6 U+ I2 s+ H1 K" U U. V& Y
def show_tree(self, path="demo.png"):8 i1 i/ j5 ]+ C4 ?: f
map = self.map_str + "}"; x* o( b7 m. Z3 ]: r6 p$ `
print(map) ; |6 G$ A: D% H9 q' h6 l graph = pdp.graph_from_dot_data(map) + K. J7 P+ z1 O2 F: V- J5 w" T graph.write_png(path)4 g8 Y0 ?; x x/ m1 d3 U
, [5 q v# Y1 a. o( B# 学习曲线评估算法精度 dataset: 数据练集, label: 纵轴的标签, interval: 测试规模递增的间隔- N& g" o, z0 O; K- Q) F6 Z. Q2 X$ }
def incremental_train_scale_test(dataset, label, interval=1):, k; |. q! D; X0 j
c = dataset+ u A7 |: F3 S i1 o/ R6 \
r = range(5, len(c.data) - 1, interval) / O1 V" f$ X8 ` rates = [] ! \9 b6 d* {' } K, _. g6 N- m% g for train_num in r:. H8 k3 [# X9 J8 ]7 i) L$ i
print(train_num) z) N' N) ~/ }) D { train_set = new_dataset(c.feature_names, c.target_names, c.data[:train_num], c.target[:train_num])! D4 Q) \3 O7 A( [' e
test_set = new_dataset(c.feature_names, c.target_names, c.data[train_num:], c.target[train_num:])1 A. p6 e& w( _" @
dt = dt_tree() : d. f4 _% ~1 `1 a1 @, f5 D dt.fit(train_set) & |7 J1 }' t; T8 G/ f3 Z rates.append(accuracy_rate(dt.predict(test_set), list(test_set.target))) - p& a$ K+ I5 }1 Z7 j( ?) }) X! H6 r7 g9 F7 k6 L
print(rates) ' {5 [( a; a5 K+ C7 C plt.plot(r, rates) - ~* u; _, X5 N1 v plt.ylabel(label)$ W0 M. q( _" Z" L6 w
plt.show() - H- r$ S7 }2 I* x N; h% J& w& M, G* ?' I3 Y: eif __name__ == '__main__':7 P- C' V+ b) H+ O: r. e
$ ?2 V5 N9 L& @$ U G# {4 e, N! c c = load_car() # 载入汽车数据集' f3 _0 J, ^% F& H ~
# c = load_mushroom() # 载入蘑菇数据集/ }9 Q+ F% Y9 W( o/ l7 v [4 m
train_num = 1000 # 训练集规模(剩下的数据就放到测试集)5 p8 L( |, Y/ v9 n6 _8 J% m
train_set = new_dataset(c.feature_names, c.target_names, c.data[:train_num], c.target[:train_num]) % `& V" D% t( V* ] test_set = new_dataset(c.feature_names, c.target_names, c.data[train_num:], c.target[train_num:]) 9 L% q, l4 N) l Y/ `1 [7 Z7 B! e8 M: |( p2 Y
dt = dt_tree() # 初始化决策树模型4 C. G, Y( }7 L9 u, i) e6 Z
dt.fit(train_set) # 训练 ( o8 d7 G4 z( c% c4 Y. ]3 b dt.show_tree("../image/demo.png") # 输出决策树图片7 t$ W4 \8 ^, s8 n& s7 B
print(accuracy_rate(dt.predict(test_set), list(test_set.target))) # 进行测试, 并计算准确率吧 " V7 l6 `+ O% t8 |5 {, f w 4 M. n" E0 u# `& p6 R: g4 o # incremental_train_scale_test(load_car(), "car"). x0 s+ l% H% t) l0 c- p
# incremental_train_scale_test(load_mushroom(), "mushroom", interval=20) 8 Q- }. Z1 k" x( W* z: I+ X, {6 }7 ~8 F8 \0 j
6 n$ K6 W8 i% _: s
8 C" W; _" [, H( D% F2 r
1 ' G6 z/ ^8 f. a: r2' t# a& F: A, R @2 R. h5 V/ F
3& z7 \" X+ w$ q- |& `) ?
4 " C' \0 ?# Y9 u% J1 o. X1 j5: [# y1 Y' a5 y
6! [8 X5 g3 p. N
7 ! p1 X8 Z- r" @" D$ }, g% ^8 % z+ Q ]/ v, \2 a2 F9 8 T$ V' A- P$ z10 . i | `. U0 l; y+ Y$ [. U11 " o W6 r h+ [, L: V" O6 b12 6 h3 F3 N9 a9 Y) }5 f9 _7 W13( u3 Y2 V$ M* E
14 4 ^6 H& m9 d" W+ y: R+ _15 ) ], j" m# C- W" h8 Z; s16 2 x w4 M8 ?6 ?0 F17 6 C& ]+ ? P5 [# k18$ S. I, x$ A9 r, A* |5 K
197 Z5 O8 P/ I2 b5 A1 H
20 " E1 J& s, \% @3 G0 x6 U, T21 0 z0 d- A/ u( |4 ?" x9 G22 $ {1 o5 X" U% Z) ]2 _230 w: F! t) {8 c' z
24 % E% t; m! v6 h% w4 V/ p25 . v9 B) `+ i# \% n. \26; W# C! |+ Y! \& T& i( e9 I
27: T# K0 b D" j3 o( @/ n1 l& J
289 @. m3 _3 K2 e, P
29, ]& A0 E/ F* D9 u
30" |) h$ |: z5 _
31& O' q0 o7 D- R) _3 p
328 A! Q6 }1 |0 P$ z& M7 w
33; S) k B6 `, }) v( t
34 8 ^7 i' |! T1 Z& q/ c35) d- V6 Q6 V* A; V: M: k; _7 O
362 e3 m& q4 X) Z: r/ m
37 7 V" Z! s J. q( }. \& B( k" T+ p" a38# G2 d+ B% Y; ?! A
39' @! d: m7 N w# V" z# u& h
40 # c5 P: g9 u* t41! N, E* ]' w0 _# u8 F, U9 }
42- B- R$ i' P" l' x
434 a; U1 m1 r: y1 I
44 X7 R! O- j6 J" _& F+ I J
45 . U" c' i9 [+ H9 t7 [5 a46 / u, X7 Z% a8 M47) e; W% \% n: ?8 R1 O' P0 `
48 & F6 }) V7 S0 ]0 ~49& ` i. `) r6 P! P% @& u' i" ~
50 7 Q. g: j- v0 Y' a' r51/ K6 g+ l! H: i
52 . |7 V5 G! A" _53% d/ Y! U. y/ `5 N
54 0 ?- `# @* R& p2 k( e55) b3 p+ h& H0 h5 t: M
56+ e% B/ b& `% v6 p4 G8 C
57 - f; i- `4 [; n/ e X' X589 W0 p$ ] ~) c+ s$ ?9 B* F% [3 ?$ p
59 8 u8 p4 p8 p+ w6 v' n603 K# d- H9 T) E! i( E* ?
61 ! J. ?& @( O6 G- V628 f4 [7 T7 j3 a# s& Y- V$ B; M
63 4 u3 I. ~5 \# c s$ \4 o" [. F64 3 k p9 X c) H" K65 ) R, p1 B# \6 K& J6 {* |; E# ]+ M663 `* ]) _( M2 y& Q! E
67 9 E- o6 T. P5 ]" b" w4 x0 b68 1 P2 n1 s, k; _: w698 a0 z- {( k5 X# R J" ?4 s. V
70' K! a N+ R$ X4 t) @1 Q
71 / @) |0 X. ]1 V- }$ R722 u/ S& \' e& {3 H/ ]% W
73 . D! n( C6 {/ `; e' K743 E+ H6 k4 W8 u V. e
75 " ^$ ?" B$ v! S. r0 v6 b- w/ W76 e9 M) I- R" u77 : p2 W: @! p, R1 Q5 p3 L ~78 / T9 D6 |# w1 }. X* r4 f' L79 & u1 I8 g7 ?& O+ L3 Z80 , a+ Q" ~3 T& ]+ r. a816 a$ q, J5 @( p4 X1 q
82 F# {9 U8 J& u83& h7 r K$ f7 Q2 {1 S
843 L. j J! R; t/ D
85 : H s# |" r- Z4 Z' g% ^! h86 ( z2 l! c- k8 M/ ~" ^$ U875 P& t- E1 H3 N& x g5 H4 M# }
884 A+ z7 v5 ?- y0 u1 }) N
89 . F8 l1 q. M9 F% d1 _2 A9 F$ E$ n90( z; i7 Q! B9 P$ h
91 / ]3 R- }3 n% r" X' s% ?4 e9 M92' ~# c( Z b1 \0 e$ C/ Z
93 ! J" {" f z8 a2 ?+ u" L6 M94 9 t1 F! U. l1 F+ l" g3 N959 U+ _# H" N8 \1 {4 l5 P( ~1 X
967 b1 ^ u# }" o
97 E/ x% ]. |2 E, R% I, O) o98 i K v9 a! J* i2 v- v2 E99$ T. m+ h k j* f1 V$ Z9 E9 b
100 ( j/ I* Y& F9 \+ Q) i101 , S, F) p* K0 k K4 o( O; w/ z102# m& k# b6 e- Z$ ~! N
103 9 H! {2 v: [- C% d x8 [104 7 a# V ]+ z Q$ }1 B1053 `6 {% L( |. ?2 R* ?; O. r
106+ J: R4 A# f3 g6 b" ^
1076 o( g# I. m( y7 B; I
108 , l" i# _& K) f+ r4 b+ M4 J109 0 z0 j0 B2 v. M$ a( \& {0 j110 # ~3 B. L) k a111, p1 _: y- ^( }
112. P9 d* q; z' A
113 2 E3 q9 q6 b9 H6 X114; i* I& I6 f8 X6 e. t' e/ ~
1151 d/ a7 x; a" ?1 I" C$ A; {7 m
116 0 F: i+ l' j3 O# ^. e5 E& U117# A/ G- x- L4 r# `0 ^9 J
118 ( J# `! H* v# n b119 9 @/ {, s* o" Y" [/ T8 k7 S120- s6 J0 S U. _! h& M/ }
121! p$ n5 a, q; g% A& C
1228 M" X5 s4 o) V5 S! `
123 7 e1 |1 _2 n2 I* m124 . i* }5 J4 r8 E+ k0 m125: D; F0 `7 O" v4 v
126 ! v/ h* X/ ?4 O3 [127 / O/ n: i% s! j/ B7 i128 * e S3 V9 w1 V9 |1296 t( [1 b- ^' ~4 Z' G5 L
130 0 a* Q" Q) f3 x1 H' P, u. s131 5 c0 m& E6 {- o% N" k( d# {% V132 9 M% r: |$ f! C& q133 % N: y6 m' |6 _0 X134 * \# m- P6 U6 d; t2 ~1352 e* T+ b8 y! y
136: Y$ V- n1 d2 o, z/ I& i
137' p& v0 {. [4 m4 }) w$ x' Z; {
1387 R8 O2 X" ?8 \% ~6 _6 C
139 : q2 }! D6 Y& ]) y, a8 x5 ]+ L2 _& |) |1405 x) z9 U0 k+ d
141# C; P$ B$ B& Y, q
1425 g, I( K3 k$ z: k& t9 @- C
1434 q1 c) f9 k) h7 L6 M
144 * \8 H5 b1 R6 R/ d8 C# U# K) `, y8 Y145) Y& T/ `9 t/ k9 V* z" f
146( F7 ~- ?" u3 v# r: y: Z7 P
147, e2 r/ y1 V! @& ]# s( P
148$ H" w# Z; h2 g/ ?: Z
149 g! \! D: x+ ^% d* z4 l2 _
150 G5 Y5 X- V: O
151 4 a1 j' h4 N- s; \4 s) J152) z# i. M" \1 U" d
1531 I# u5 ]6 c/ u1 H$ C) r5 G
154 ; r/ a9 s: l" e: S3 `6 I% s155 7 N1 `3 d. R5 ^# t' ^8 Z2 S% H156( w! k7 m D# T
1579 B `, A2 X8 W& R
158 2 R+ `7 W. ?0 {! ~159) K( z" \' y8 U; x
160 4 n0 v1 t7 S2 R. d: x* [ `3 C7 ]) N161 * }3 A0 c7 J! K" t+ ^5 v: W0 c1623 d) ? q0 }; c, C0 L
163& p2 \2 ~- E5 K& y' c, Q; E r5 B
164 6 [/ ?4 B; H. a+ h! r165 8 J& N" d$ \ o' i$ G& `, g166# r4 S# ?5 k: n. n. C/ i5 v
167. @2 K# Z* [* C. t1 u$ ^, I. d
1687 _; b$ R& R" Q" Q; R# t
169 9 \) ~3 g# n) i! T2 ^170 , Z, m! }( \) y9 X171 + p. h# L [" H1 j1722 ]- D6 |% n+ j, ]9 F/ p* M# Z
173+ r6 ~* _+ _1 Z: q2 P' p
174 ) N) f+ W, H) c1750 v }: Y. K4 J; n o/ z
176 & p, C. e- G( u) m) C% `/ O177 * i0 j8 E# ^/ K6 D178 7 V- _7 b* l n- k7 ~179 8 p, h& k7 j: m- U/ o3 U! w- O180 . }# R4 k4 y4 k; T1 T, g+ Q181) z/ @1 L# ?' l; U9 `9 O
182 1 o {6 u1 y9 ^" j; l) R& z183* g( Z9 A$ n- C4 _, ]5 C/ X. B: m
184& F0 S1 e% [' w6 T7 K) p- w" H9 [
185/ f) M" p6 M; o9 w
186$ {8 K* _. u2 J8 o
1874 U; ^+ Y# f9 _
188 ! B: u) \- d, X/ W+ p/ ?5 S) o) j9 e189 $ k2 w) |* n U) G }/ k& c0 }: n190( V+ x# D2 v( o5 c) K* G
191 m, X8 T. l; G1 M! D0 [6 n7 M
1928 E- M1 K: C. E L' X: ?& V; H
193 * f/ z! C7 @, c6 Y$ m ~194+ M# c7 h3 o* S* @7 u$ e- }* M
195. v9 E* z% R' o" e8 s F
196 L1 C" p8 j0 X' a* S* c4 I# W
197- e" M f1 d/ K
198 8 U+ v+ U- d' C1 l& w! Q) K/ k5 Z: e! s199 u. W2 U7 [! `5 s' E) J2004 F5 W, P& C: v. v
201 1 }4 Z& j0 a& V @3 w5 k. m# S202 2 b$ [- L K6 }) Y W# ]203( x3 l) b) b5 ]9 N* ], e% [6 s( W' I
204, R" l7 ^* \3 r% }) K( m! i: H" P
205 $ A+ a3 r" H/ f& h( J* f6 ~! h206 8 K( g( Y# }0 Z( Y3 ?207( j! b* s& @5 ?1 N
208 ; T1 t& _6 j% C% M7 \: F: h209 / x3 o. }2 M1 W+ A! V( ?210 3 i1 O) |3 K0 q& q% i% [* ]$ u211 " p; ?5 V% d7 X. G212 7 l/ g( ^% s) @' K213 ! _) p" `6 L5 U7 z2 D+ z214 ' W# v- z% @+ g/ J/ F H2 K" n7 A& @: {215 : i/ }. P- C& U7 h( ^0 K% ]0 y216 4 E! B' u" `6 I( I217 7 U- i4 S: T: R. A# q6 b% g! n218 - [$ p1 m4 h* F/ q O' j2 l$ \219 ( x, V! J2 t1 C% j3 i2 c' f220 ( @; z: h z8 F) y4 a2212 \; z: J/ q$ M, F1 u0 I
222 ) ^! D% r0 k0 u, \223$ W' y( ~7 m: q( D; J& I) }& Y
224 1 u$ f) Q# e: ?8 E225 4 \: x6 v! L1 Q' D226 $ x8 q" Z8 }% `! O9 N3 M$ U227 ' v: L; J/ e7 H/ Y2284 |, v3 u4 P% @8 e' ?% Z' U; M
229+ N' t( q' }3 H# z# i! U( Y$ q( O
230 # K2 I; s, q5 Z* W231 + e. z& y/ Y r: [& {232 7 D8 a( L* f1 Y; c: B2 \233* y" i8 z2 |, K2 O( z
234 6 o6 Y8 w$ U+ q V2 c$ R" X5 W7 l235 , ]1 Y3 c( ^1 |+ n4 f0 G5 n4 ?6 [236 0 f" O, @7 w7 C2 y237 3 z$ @! s. o' c5 _0 W( Q0 q2387 }# q: q9 t. p- j
2393 `: h# d L9 B
240 0 O! P4 }0 F" @" p+ `& W( g241 + d! ?0 L# t1 E' e5 P242 + r5 y6 o4 D, T. l% V243 ' ~6 B% t# z7 b2443 j" A* t" T! g- p
2452 `+ Y4 l4 F* a/ C' l
246 ) h1 T3 \! R' [1 N) k' Q6 V/ @" E, L247 ; W- ]2 G$ G# f# n4 O248& s% E- o8 ?5 D! [- C
249 . I1 Z; T! P% R; Q* Q250 - w7 D# v# B8 \; T# Q2513 _% p5 ^, Y# s0 \
252 $ K* f2 r7 M# e% K2538 M4 A8 f. i2 B5 G" l& w) J. ^& d
254 9 F. _' T5 H6 A% n3 _255 - k+ M! B6 P6 V7 t( y256 $ B2 ?0 n+ Z, H, o+ a, [257: R( V E e' V! f4 H+ X! |2 i- I
258 6 K- \2 @1 f, q% W" N- E: N6 m3 |259 2 }8 x# Q$ m8 C- k& Q260/ c; Q V& g2 P
261 5 N2 E- g& A S: ~; f3 d- L H% ]& @262 + w" H5 E+ }9 _, W1 Z$ }" Z2 z2636 t+ L2 s/ x6 L$ \; U9 G0 v" k2 W
264 . W* c5 s8 ]9 L$ W2655 Z3 B9 G8 J& ]5 g0 P% y
266% C: j1 M1 l# x5 l+ b* a
267, ]2 S. g, @. H7 ?
268 7 M5 w5 _% {, S: q5 K3 \! C269 n+ C. m9 ^* @0 O2709 B7 }, } V7 i" T8 G. s5 @
271 $ h+ @4 @$ ?! G S272% c1 @! Q" [3 S% G# U3 D
273 0 X, n. H- e% i* G; f) ?% ~274 & \9 b8 ^9 m& _8 A' s" h275 - j. C2 H/ `3 `; P7 S- r) Q9 T276. r. j6 B' z9 A+ _0 U5 T5 ?
277 " B# T% P/ C/ d; r' C8 y0 E* L278$ j& a6 R3 S4 ^1 b# }4 g
279 ' o$ V: t. D; X- H q# ]4 ~ Q( C280 . j9 Z1 T+ i6 [. c* l6 }281 ( G: d$ }( t$ l( p' Z/ U282 & w' P1 ~; k- D% l283% R) j: \% i# T1 \) H+ L- \
284 $ t$ S; U$ v3 o3 h7 N285 & Q, z0 W( L. f286 2 j" A% |4 I f6 _287 9 ` Y# M$ A5 Q' y9 F2882 c P6 ?! F$ s; ~$ X9 [
2891 c" [5 k p$ f% i2 U! p% X6 m
290 " V( M, a4 Z3 C I; q$ a8 b; }2917 g$ t! z- l7 V/ O* [# {* ^8 [$ W
292% B0 n1 u% k5 }
293 5 T! W6 m3 v, P" n1 Z, g$ l8 @2946 i& m7 C) c4 h0 r' [. p: n
295 y+ T+ I) F& r' n296$ O( P) M% B) ~2 U( y
297% f1 @2 p/ H' @. {9 w/ Z' D1 o
2989 C7 B8 w; K# H/ A" x% x n9 S0 T6 H
299. h; B0 x; L# m5 H% R
300 - M* `" v- ]3 Z$ U, R+ n301 8 a, N" @$ F( m' H0 {, U8 t% S302$ {7 g, d) y. Y. j4 {) I. F: Z
303 % C% m+ R7 I( n( V& E304! Q& Q, {* Z# Y4 S% Y/ V
305 {" Z+ c Z6 N% x; W8 C* f, j
3060 y3 T! G* N9 C9 x
307* R1 m' g+ f4 Z3 U$ z. }+ S5 }) G1 r
308 9 m4 w' l3 |0 | O/ N309 7 h0 H# o( e+ A; |% S- Y& r& _) Z, d310* Q) @4 _3 J! g f+ j. K
311 6 z4 V4 E2 f6 F, Y6 k- s312 + P+ m- @! ~! J/ Q: u3 j% R313( H7 J, w* W; o" c
314 . x$ w! Y5 w/ B315" `% A7 X& E# D2 I( [# c
316 # Y7 `: E4 I9 p; _4 E" i5 k317 D3 `7 D% r, W; k2 u9 p' t3189 f9 S% T& M+ S5 K9 e1 z0 x: p
319* n8 x. `( z0 a {4 P6 G
320# I" Z8 [3 c( f) n2 M: z3 d8 F
321' U* f$ j( C2 R3 ?& K+ n, X
322' l7 D# z8 |" M+ a8 }
323 , ^; V; l* ^' A# u324 * o |* z; [* U325 ) [7 p, p S( R7 c3262 e" `4 R$ H0 H! D
327 J) ]: y1 x/ ^- d) r0 O5 g9 \$ y328 6 S! ^5 L/ J$ q7 s329 |. @/ }* W e& `5 x& l# H8 w330 " A' p/ i7 [8 J331 Z4 h; G7 z5 O* H1 u# y: H& A3 L# f: s; C/ l; P7 d
1 }+ d; c% Z! t4 p( ]9 k3 b9 o. ]
* f% E$ D' ^! t3 A( \+ c1 H8 M! b$ a& c1 K0 e
( D: E; I. y) r. A$ ~& e
- t1 M4 Q& ?, \! }
4 `8 Z# H" d( D; x 8 a l i4 i7 {+ m 8 F7 M9 o+ S! i# Q/ _4 E. G) A* H4 C6 j- {) `9 n# l
. A' L- _! @7 N* E) M ; c7 b t8 @0 P$ {$ W# j- N————————————————3 [% c* Y0 S, N1 H3 v
版权声明:本文为CSDN博主「biyezuopin」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。 / J6 T8 T% Y& H* `& i原文链接:https://blog.csdn.net/sheziqiong/article/details/126803242 4 m; O, y7 s" _* v9 O; f8 O2 E% z7 g/ o! q: T. ^& w) G