# Y! }" I- c/ A. Q$ n Q4 ?""" 7 V1 Q6 Q7 X( J) X% K; I19335286 郑有为: M* c; a2 r$ M- q6 Q2 u: p
人工智能作业 - 实现ID3决策树" p( I- y) @' t, {
"""# r) V1 X& a J) n7 l$ l: S
# e! [* `2 s5 f8 n3 T% Ynonce = 0 # 用来给节点一个全局ID 6 f, O# G7 {5 F% H) \& n3 vcolor_i = 0" T) m: N, e' ^( X% S+ A
# 绘图时节点可选的颜色, 非叶子节点是蓝色的, 叶子节点根据分类被赋予不同的颜色 0 H# m1 j( ]/ Z2 o/ kcolor_set = ["#AAFFDD", "#DDAAFF", "#DDFFAA", "#FFAADD", "#FFDDAA"] 3 ^/ c/ E4 |6 i$ o# Y; A* V, a6 J ^3 _
# 载入汽车数据, 判断顾客要不要买& |7 k$ I9 H, o
class load_car:3 }4 Q0 z/ @& t; r+ I; z# x( v
# 在表格中,最后一列是分类结果- W1 \& { z% \, c
# feature_names: 属性名列表' t! ^8 l# k& N( k
# target_names: 标签(分类)名 3 q9 l0 C7 C; K6 H" ]' g' u # data: 属性数据矩阵, 每行是一个数据, 每个数据是每个属性的对应值的列表 ! B3 `. U# e. }+ | # target: 目标分类值列表7 y% p! ?. E) e; u6 z' v' R
def __init__(self):/ z0 j2 p; X/ z8 \$ P- r% v
df = pd.read_csv('../dataset/car/car_train.csv')- X. K+ w) q/ N. u' X0 X
labels = df.columns.values0 I) C9 |5 z2 X! M
data_array = np.array(df[1:])/ X) }& K2 v6 ~0 P% w$ m( e& X
self.feature_names = labels[0:-1] w9 G W, X2 S/ A z6 w3 j self.target_names = labels[-1] 4 ~0 |+ P, E* Z, p5 I0 l self.data = data_array[0:,0:-1]+ b3 }. a* b$ I* x/ |3 N
self.target = data_array[0:,-1] 7 P, m2 G, N. w7 `( \; N: Y8 ?4 v! y% l% O, y$ U* U
# 载入蘑菇数据, 鉴别蘑菇是否有毒 % Z7 I8 B( i7 R6 b Tclass load_mushroom:% s( ]% \7 ^ c* ^" y. F
# 在表格中, 第一列是分类结果: e 可食用; p 有毒. ' N0 `5 N' R- f9 A/ M6 i# `2 U # feature_names: 属性名列表 0 `/ Y3 ]3 N' y # target_names: 标签(分类)名 4 h$ S' N z" \" A5 r # data: 属性数据矩阵, 每行是一个数据, 每个数据是每个属性的对应值的列表& m/ M( B( R2 `7 x
# target: 目标分类值列表 1 L- j& v, I, b2 O1 Y def __init__(self):0 D- {4 h4 @% [8 e: C! M
df = pd.read_csv('../dataset/mushroom/agaricus-lepiota.data')+ L+ `( F3 e5 s! Q' }" X
data_array = np.array(df) n" P- y2 T: u: @0 N' U' y. F
labels = ["edible/poisonous", "cap-shape", "cap-surface", "cap-color", "bruises", "odor", "gill-attachment",1 ]5 W8 T* o# }# g Z' W. @( l$ b
"gill-spacing", "gill-size", "gill-color", "stalk-shape", "stalk-root", "stalk-surface-above-ring",+ m/ z% \4 `5 b7 D( _( \# w/ x; f* P
"stalk-surface-below-ring", "stalk-color-above-ring", "stalk-color-below-ring",+ J1 q% y" m' G) ~) N# p# V, ~
"veil-type", "veil-color", "ring-number", "ring-type", "spore-print-color", "population", "habitat"]- v# X9 s+ U% c% s$ t
self.feature_names = labels[1:] % P+ p+ b, F8 }. X self.target_names = labels[0] 5 ^: ?, @; K# q. j8 i! D l0 k self.data = data_array[0:,1:] ) Y; H4 l$ B& `/ V5 f B self.target = data_array[0:,0]3 K4 W$ v8 N8 Y% N
9 @1 u4 u$ ]& `& d! t# 创建一个临时的子数据集, 在划分测试集和训练集时使用 $ F7 I8 ~6 z. Kclass new_dataset: " P3 O: {, [% \7 y8 q # feature_names: 属性名列表' Q. T4 |7 P$ ? T0 Z, d/ {
# target_names: 标签(分类)名 3 u/ B- X, r; p0 ~# s9 X # data: 属性数据矩阵, 每行是一个数据, 每个数据是每个属性的对应值的列表 ; G) l. }6 [& @8 u4 ?% D C% d; l # target: 目标分类值列表 X2 X$ j& E; ], B
def __init__(self, f_n, t_n, d, t):* Y. V. l' U! v/ e4 F c8 o
self.feature_names = f_n 0 l Z1 }' K p% H self.target_names = t_n) ^: ?- B# o% Y+ D+ i3 E
self.data = d' S4 h* y! f1 t8 u/ Y
self.target = t , \# m% H! E" Y 9 v3 H+ U h5 v9 a3 n# 计算熵, 熵的数学公式为: $H(V) = - \sum_{k} P(v_k) \log_2 P(v_k)$% h7 d; H' b- K* Q4 Z3 R" N
# 其中 P(v_k) 是随机变量 V 具有值 V_k 的概率, v% a% H& O+ p9 I, @- u5 i+ i( A
# target: 分类结果的列表, return: 信息熵$ ^: g d* _5 \9 V
def get_h(target): 2 g d$ Y' H" q target_count = {}# d+ m* o5 G1 ~' V
for i in range(len(target)):0 v( a0 F2 R3 Q* r$ a+ t3 x" o! o
label = target 0 o* L% P5 S8 C" N# Z2 t, t$ K if label not in target_count.keys(): ' n1 |; m) T% ~( h+ ^' ?7 u target_count[label] = 1.0( c0 Q4 [: J% S) S! M, ~) s
else:0 f2 Z* w: A- a+ `( l3 `
target_count[label] += 1.0# F( W, i/ a" D/ k$ X
h = 0.0( s& R# r+ w% @ Z- ?
for k in target_count: : Y$ G6 k; g; l9 l p = target_count[k] / len(target) ( g) m! r) ~3 {8 k+ q8 F7 b h -= p * log(p, 2) , H" y( m) S, ]# Y- B& u2 r' s return h+ a* C, M- K: E7 x
, q- s+ U6 _6 _# 取数据子集, 选择条件是原数据集中的属性 feature_name 值是否等于 feature_value3 _7 E9 r9 H, F& x
# 注: 选择后会从数据子集中删去 feature_name 属性对应的一列, b/ ^4 L# X, W& `- B0 S
def get_subset(dataset, feature_name, feature_value):7 b& e+ B& |, G/ Q1 b% U& R9 h5 C/ ~
sub_data = [] 0 B- T* @, u% V3 |- d% I# C! H sub_target = [] 4 f; ^0 f( K7 g, P2 m9 e O* I5 ~ f_index = -1( l# E' N. A7 t/ ?6 @
for i in range(len(dataset.feature_names)): - p/ E: F" c8 V3 m6 P8 ] if dataset.feature_names == feature_name:; {, w3 }9 i9 @) h8 U2 O3 a
f_index = i( L r" m0 U5 a7 D' ` Y
break- V/ x2 I/ [! V! P7 @) k5 x% C
( E$ o$ y6 ~% s$ T' F for i in range(len(dataset.data)): ; y4 j" }0 A3 @5 W if dataset.data[f_index] == feature_value: * K" q( W$ Z( J# W l = list(dataset.data[:f_index]) * V8 y0 T* P, V l.extend(dataset.data[f_index+1:])3 N2 m# _5 u/ y) |8 R
sub_data.append(l) 8 e( ]: m* A4 l* \+ |4 ~ sub_target.append(dataset.target) / f( y0 W' W; I4 ^ 2 v R& v9 ^! r' F6 i sub_feature_names = list(dataset.feature_names[:f_index]) ' x! o- B! c/ o: R" E/ o* l sub_feature_names.extend(dataset.feature_names[f_index+1:]) 0 C' \1 n3 Y0 I return new_dataset(sub_feature_names, dataset.target_names, sub_data, sub_target)5 P- E( C/ s3 H0 ^3 y
5 F1 B$ F+ K: {/ N1 U4 j
# 寻找并返回信息收益最大的属性划分& H; }% m- p4 Y
# 信息收益值划分该数据集前后的熵减& i3 A$ N( V9 g" ^8 A' H4 M7 Q( J
# 计算公式为: Gain(A) = get_h(ori_target) - sum(|sub_target| / |ori_target| * get_h(sub_target))$: i0 K- J8 V' A6 O9 e, w
def best_spilt(dataset):) C: w3 d+ c7 w6 C
. [2 c+ z A2 @
base_h = get_h(dataset.target) % l# {7 W3 ]: z best_gain = 0.0 0 m3 W9 D* \; r/ h& r& t0 F best_feature = None 5 A [5 T8 y/ }( `- ] for i in range(len(dataset.feature_names)): 2 D9 Y, }( B1 Z( e8 h2 z feature_range = []7 o: c. h! q0 [" n/ _. [3 G7 s4 f
for j in range(len(dataset.data)): 7 U, n& p: ~/ w* ]" p8 h- m( u7 w if dataset.data[j] not in feature_range:- \ l/ h+ Y0 u$ U
feature_range.append(dataset.data[j]) 9 |# P5 x2 v; U- ^) M: {; u0 n# l4 ]( U" \
spilt_h = 0.0& Q% d, m6 d" O+ _
for feature_value in feature_range:/ T5 i8 J4 f( D4 _ F; M
subset = get_subset(dataset, dataset.feature_names, feature_value) 3 |* B+ H' @. \+ X0 ~ n( o$ z# e spilt_h += len(subset.target) / len(dataset.target) * get_h(subset.target) & {9 f, n$ k4 B I; Y/ X3 k- h. M7 I# n' T
if best_gain <= base_h - spilt_h: 2 x0 e7 W$ [! Z }7 W% c r; M" Z best_gain = base_h - spilt_h J6 n0 b+ y* e% U1 `6 I) J5 g
best_feature = dataset.feature_names+ ^$ S- b5 E% ^
; p! O2 T; f. {+ U) e4 Q. k9 g" C2 Y return best_feature ! v0 p; s9 Q7 ]# f) g+ p% j5 e8 ~1 S
# 返回数据集中一个数据最可能的标签. G- f; X- W/ i8 p) J5 _
def vote_most(dataset):2 q4 n4 F( R2 h* \! \- z
target_range = {} % T. U) M; X" d& M0 Y best_target = None 9 Q. E' @- J5 D7 a best_vote = 0+ r4 x- [5 W# ?1 i7 @
+ b2 \4 ?! ^+ y1 _4 p T( M+ z for t in dataset.target: ( C3 |4 a ^9 c9 j9 R* w. O+ T if t not in target_range.keys(): 1 o# Q$ V8 _2 k+ h- q$ { target_range[t] = 1 - S' Z' H7 I- N' c else:$ C: u- }# _ A' {+ Y- w8 l/ Z* a
target_range[t] += 10 Z, F1 `! I3 p/ M; I _
4 }- i1 {7 U2 _& ^ for t in target_range.keys(): 5 j# X' v8 [* S! s5 t8 X if target_range[t] > best_vote: 9 V6 n7 B0 i/ R- i! B U9 ? best_vote = target_range[t]- y O( E) I& T' k4 N
best_target = t7 x: b1 q+ F8 x2 U7 m5 d
3 Z% k+ m( I/ I( M/ |
return best_target / p, K1 p9 `) a5 C/ S+ @8 N ~. B4 J5 q' \
# 返回测试的正确率 6 a. Y" j3 |! N# predict_result: 预测标签列表, target_result: 实际标签列表 3 ?3 \" B3 O R3 g0 {; M& cdef accuracy_rate(predict_result, target_result):$ Z; I; g# S6 e$ t/ i" Q$ R& _- M0 w
# print("Predict Result: ", predict_result)" B+ g- L, F2 Y
# print("Target Result: ", target_result) 6 q4 g8 G7 p1 k; i3 c accuracy_score = 0 % E7 J+ Z! z) O* ]( y for i in range(len(predict_result)):/ s+ k& L* x" M5 [* e& G
if predict_result == target_result: ! @- A% }8 p8 `7 E9 s accuracy_score += 1 ) _. p, Y. j( i. y: B0 {% ^$ v0 x/ E return accuracy_score / len(predict_result)6 ^' N6 _: G% l3 l. a
5 x* o/ l h$ t) P3 @. V) o
# 决策树的节点结构8 S% \, t J7 _& e4 W
class dt_node:' e" P5 [4 k% z7 q7 L+ U# W
; w2 x! r" D( P. B# }$ k
def __init__(self, content, is_leaf=False, parent=None): 6 R! j% r6 r# V b. s5 A% Y global nonce 0 W+ ^6 j1 r! Z3 V; N- G self.id = nonce # 为节点赋予一个全局ID, 目的是方便画图6 k8 y2 e+ L, R1 q3 b5 R
nonce += 1 ) w3 A) w$ t. r) p0 [1 M% m self.feature_name = None 1 S6 E) A1 o, {3 v- k# S" Z self.target_value = None# C% ]# j, M; K& T- j }
self.vote_most = None # 记录当前节点最可能的标签+ O- L( D7 b: J2 ?5 {/ T
if not is_leaf: # H7 ~7 P3 W' W0 ~" L* ] self.feature_name = content # 非叶子节点的属性名9 W2 C$ K7 Q4 R2 {* E, |) L
else: + P% h4 o' j) B j+ ]5 U& E. @ self.target_value = content # 叶子节点的标签8 ^1 F6 j' E1 {; o6 S0 A