7 n! O! Z7 W- _# 载入蘑菇数据, 鉴别蘑菇是否有毒 3 [0 k+ n( S8 j8 x9 O+ j& dclass load_mushroom:7 R9 Y3 v+ G1 c/ F) }" R- U" i
# 在表格中, 第一列是分类结果: e 可食用; p 有毒. % z* D2 e9 n. V: v, y3 \ }3 J # feature_names: 属性名列表. {( E7 g3 E% D2 {* ?9 s
# target_names: 标签(分类)名 ! r( C" e+ H* b) `# B% d, R Q4 q # data: 属性数据矩阵, 每行是一个数据, 每个数据是每个属性的对应值的列表 `. [) H. z* X, f" i# p! |
# target: 目标分类值列表6 ?- `! w3 l L' q3 o+ n m
def __init__(self):8 | \+ q5 k& ^0 ^0 j6 ?
df = pd.read_csv('../dataset/mushroom/agaricus-lepiota.data') : A3 Z: t& A) E5 i! @2 b, X7 ~ data_array = np.array(df) 6 z7 U, b k% c" o: n3 y labels = ["edible/poisonous", "cap-shape", "cap-surface", "cap-color", "bruises", "odor", "gill-attachment",. m3 L* c6 P$ B% p, H& z, t
"gill-spacing", "gill-size", "gill-color", "stalk-shape", "stalk-root", "stalk-surface-above-ring",# b( b7 W; K. {0 V. \3 G0 t
"stalk-surface-below-ring", "stalk-color-above-ring", "stalk-color-below-ring",- ~7 ]0 D) ]- `
"veil-type", "veil-color", "ring-number", "ring-type", "spore-print-color", "population", "habitat"] 1 l9 X0 \3 S5 `% x4 d7 t7 A* g self.feature_names = labels[1:] 8 k2 ~3 G4 G/ t* [, Y$ R' M self.target_names = labels[0]$ F: W$ @" c5 U
self.data = data_array[0:,1:]$ ?3 V6 o! x; g* G( P8 ^" V. {$ z
self.target = data_array[0:,0] $ ?1 W; t T0 B }1 N1 w 5 h K) ]6 ]8 f2 e; L" O# 创建一个临时的子数据集, 在划分测试集和训练集时使用/ E0 @2 M: i0 U- f9 w( E( E
class new_dataset: 7 L& T7 Z+ e J! y' v3 H* B # feature_names: 属性名列表 6 n+ [: h0 o- x # target_names: 标签(分类)名 : {; S' C) y; B" A# o& F1 E # data: 属性数据矩阵, 每行是一个数据, 每个数据是每个属性的对应值的列表# i2 c/ ]$ u3 y2 K5 K
# target: 目标分类值列表9 \" U( B: [! o: N
def __init__(self, f_n, t_n, d, t): ) k: f8 r3 u0 o' Y* Y. \. C1 u8 ] self.feature_names = f_n* D6 f4 [ h1 @- ^, C9 T; T
self.target_names = t_n& u" J) c+ p2 F( P5 N5 S4 z1 p( c
self.data = d ; H0 C H5 C& f self.target = t 1 M5 Y% k, h- L6 W! J! l+ Y. u3 R0 q6 {, A6 b& P8 [) P
# 计算熵, 熵的数学公式为: $H(V) = - \sum_{k} P(v_k) \log_2 P(v_k)$ ! y/ d9 P, K. x1 D9 g; @& s0 u# 其中 P(v_k) 是随机变量 V 具有值 V_k 的概率 ! y9 q5 T7 R, @4 Z0 r/ o) q# target: 分类结果的列表, return: 信息熵 + G0 O o; S2 P+ P7 Ddef get_h(target): 3 H% |; a' s1 Y- c. |, A target_count = {}8 [$ N% e( I+ U6 b% v' P
for i in range(len(target)): : K# z1 ?! u; j- q1 @* O; ~# ] label = target ; C& C% U j; _1 W" p( e if label not in target_count.keys():+ v/ ?* L; I! U6 q
target_count[label] = 1.04 E; q) Q( O. J4 C
else:9 l8 W, e) W6 D4 u# q. k4 T
target_count[label] += 1.0 # Y, _' o Q0 \/ ~ h = 0.0( r: r5 u* j9 G) k
for k in target_count: " R' D+ |7 I3 `( W' B/ K% s# z3 N p = target_count[k] / len(target) ) ^, j2 `* t% O( |) w9 I4 \: G% ? h -= p * log(p, 2) , M9 Y/ n( X7 C7 P$ ^0 R2 A" a V return h ]! S2 f$ D. R% z% s5 Z
0 O. c. [3 [0 d2 B3 _
# 取数据子集, 选择条件是原数据集中的属性 feature_name 值是否等于 feature_value! j4 T4 { S L4 ~, E5 I
# 注: 选择后会从数据子集中删去 feature_name 属性对应的一列 " ^5 e* z' f% z/ i9 [9 \6 ]def get_subset(dataset, feature_name, feature_value): 3 B* H$ K- u$ z' S sub_data = [] 8 A0 V, C5 z7 H& y! G6 L sub_target = []1 T+ G) W5 b5 [" F/ z2 i0 r; e; U. L
f_index = -17 @8 S1 S& b& R' G. g: C
for i in range(len(dataset.feature_names)):1 z7 V- m" ]4 }; c; c
if dataset.feature_names == feature_name:3 y7 z. j- n8 H5 t. c/ D+ j
f_index = i; y: z6 ^; n9 L
break0 u9 @% `9 B5 M* O# D% X
7 `$ q+ p8 }* i9 q/ v. ?
for i in range(len(dataset.data)):* P" `1 g( `* A) x" g
if dataset.data[f_index] == feature_value: / f! r8 ~% Y3 }, V6 ^- r l = list(dataset.data[:f_index])( `( ^) G* D2 L6 T, h( J
l.extend(dataset.data[f_index+1:]) & E% q/ t6 E6 ?; W5 x sub_data.append(l) + _! ~) }. @! Y. V6 w sub_target.append(dataset.target) ' `& B0 z% o3 V: u+ p / n# H* t% H0 _9 B0 n sub_feature_names = list(dataset.feature_names[:f_index])7 \! g' D3 _! n- O
sub_feature_names.extend(dataset.feature_names[f_index+1:]); `) ~- q5 x% H \6 x) n( W
return new_dataset(sub_feature_names, dataset.target_names, sub_data, sub_target)5 l( |3 p {: D$ [
. y3 E# h) Q5 `. S$ y# 寻找并返回信息收益最大的属性划分 ' T0 W! ]. ^ V' r# F# 信息收益值划分该数据集前后的熵减( S# l9 H( ?5 t& v, ?( D2 x
# 计算公式为: Gain(A) = get_h(ori_target) - sum(|sub_target| / |ori_target| * get_h(sub_target))$ & @% h8 C; N) {; Y* rdef best_spilt(dataset): / o5 ^+ j; i) ]: ?/ `0 A& T6 F' \ # L2 c" u# ^" Y! G7 H( { base_h = get_h(dataset.target)6 Z" j. Z9 U' o7 p
best_gain = 0.0 # {+ v8 g- F: } best_feature = None' e" _4 P" u- P0 W. k4 x
for i in range(len(dataset.feature_names)): ; |: ~6 F$ W) O9 O% Y' X( n7 v, v1 B, b feature_range = []" m9 @+ u: p8 Y
for j in range(len(dataset.data)):; d4 @1 g! {5 C$ J
if dataset.data[j] not in feature_range:3 d& J' c6 Q/ g6 Z7 [- p' A
feature_range.append(dataset.data[j])$ [ Z q' d- Z- B& N# q& W
$ X d% k: i: {4 A% L r$ ^ spilt_h = 0.0) t2 z+ X/ ? C! @
for feature_value in feature_range: 0 u, F, n7 s: @3 j subset = get_subset(dataset, dataset.feature_names, feature_value)0 J, ]; D1 |- m/ G7 }
spilt_h += len(subset.target) / len(dataset.target) * get_h(subset.target) O+ l- a- f, a. x8 L; V+ D C 1 A5 e5 Z% y2 V6 @5 g4 r; g if best_gain <= base_h - spilt_h: 8 J9 W% f9 K: w% |5 h+ U best_gain = base_h - spilt_h 5 g( V! N6 a/ U# {" ]; c) T8 X; m; `# i best_feature = dataset.feature_names; [% O" _2 P+ x; k
, t9 n5 ?7 e5 w R: a1 \8 X return best_feature3 b* o; E# ^1 E% ?: W
! r1 o. k( R" @1 i$ G
# 返回数据集中一个数据最可能的标签 % q, o) f; T) y1 Q/ `* ?7 b: a$ h8 Adef vote_most(dataset):" C( x8 T) R) W8 i8 L2 T2 p5 R& N
target_range = {} V' b$ j1 M! h( _
best_target = None$ m6 u2 S. I2 s' @
best_vote = 06 R" B% n$ [- T0 B9 G& r. F- a2 ?* y
n/ O6 i5 v9 b$ A5 U0 l# b) S
for t in dataset.target: 4 ~4 T" i5 ^; ]# R/ s2 ] if t not in target_range.keys():; X5 g9 D% x! Q* u3 }
target_range[t] = 1* }+ N. R6 S' L" B
else: ; G- Y3 J3 D/ K) @# A6 U* i target_range[t] += 1# c0 A' b8 R3 m5 I9 o
( F' b* d* ]( S) F
for t in target_range.keys(): 5 ^# p9 i- A. h6 ]1 i if target_range[t] > best_vote:8 Z6 a/ L8 i6 b
best_vote = target_range[t] % a- Q! [8 P3 S1 s" p best_target = t5 {+ ?& p/ M: P1 v2 c
# n, c# Y) d- V3 X% l u6 [
return best_target- Q% @4 _; n9 x
( }8 D: l+ ~# C( {2 T+ u8 b
# 返回测试的正确率 ' F' f& b) C& P5 B# predict_result: 预测标签列表, target_result: 实际标签列表 ' L4 ]7 s: L, c( F; P) cdef accuracy_rate(predict_result, target_result): % M3 m/ i/ ]2 U; @' p- f: f # print("Predict Result: ", predict_result) ) Y( m8 h4 y2 O! j5 x3 |+ a # print("Target Result: ", target_result)2 m9 k* e9 f4 |. `4 E
accuracy_score = 0' j5 V3 R2 o6 I& T* `
for i in range(len(predict_result)):# Y- Z! A @( J5 S7 [
if predict_result == target_result:1 j4 q8 X6 i) ]* N
accuracy_score += 15 x3 f+ b1 Z+ U2 ]9 N' f0 F% R
return accuracy_score / len(predict_result) 2 r/ R3 s( c2 _2 D$ i - |: s/ A; V O# 决策树的节点结构 1 K" D S5 M% lclass dt_node:! H6 A6 g l+ _! B5 K3 j) g
2 C- r$ o" c8 P: ]" E9 R, @
def __init__(self, content, is_leaf=False, parent=None):0 b- q, D( t, Y6 `* s
global nonce! m* S. L8 w8 p* O2 I- c
self.id = nonce # 为节点赋予一个全局ID, 目的是方便画图 9 f* n& m, ]3 Z* y# Y- L ]' ? nonce += 14 p# A$ Z% k. r
self.feature_name = None d! l& |+ w U1 v' i self.target_value = None $ e# |" \1 d9 k1 A8 B, P: [ self.vote_most = None # 记录当前节点最可能的标签 * d; }# N6 U. I if not is_leaf: 4 |* D% G; k" I1 g! W, ^9 b self.feature_name = content # 非叶子节点的属性名 ! k9 B1 j" X/ W' w else:% x9 B% d: t" h5 j3 `2 S, z
self.target_value = content # 叶子节点的标签. {, p" Y; b, `1 [: O
5 L9 j, \5 ]. j. I" l self.parent = parent$ _' j7 d$ u, ^. h
self.child = {} # 以当前节点的属性对应的属性值作为键值6 } L1 P/ q) d
0 p) \6 A. T$ h$ b4 U/ ~
# 决策树模型+ g% T2 T% d2 f, f
class dt_tree:! y/ [4 y, I) q$ ~" n2 A
6 q+ m- \) c' C def __init__(self): $ T, n6 W4 T" U5 C J self.tree = None # 决策树的根节点 0 h, |6 G6 J" i) w( T: J self.map_str = """ 9 k' g, H$ I& i. J digraph demo{ 3 m( w; w; y( z. } node [shape=box, style="rounded", color="black", fontname="Microsoft YaHei"]; * @3 a- o9 s" _$ Z1 y edge [fontname="Microsoft YaHei"]; ! D5 i0 l& S& G; r% h, ? """ # 用于作图: pydotplus 格式的树图生成代码结构 y, a9 Z N; Y. Y self.color_dir = {} # 用于作图: 叶子节点可选颜色, 以标签值为键值 . U" b+ q2 z( B0 R k/ n , a" W! M9 x/ b6 m1 J5 t5 b # 训练模型, train_set: 训练集+ d, f7 P: q9 p* Z* U" o
def fit(self, train_set): ' Q Y& X! b# }0 A w0 p1 K 5 J7 P% C F' a; u$ O) _7 b% r' k2 M if len(train_set.target) <= 0: # 如果测试集数据为空, 则返回空节点, 结束递归! L& X. [+ {9 l4 f
return None # o; w2 O4 f$ ~4 N; m( N& k0 O. _( G* y8 x
target_all_same = True4 A. X6 U6 R- k7 d6 y
for i in train_set.target:2 u% [, a# V/ z! H2 x% p$ z: d
if i != train_set.target[0]: # e9 w& i- w9 D, B target_all_same = False: f# `* R; P* }. I
break& X! h' c* f. U4 N) e! D2 a
y# U/ q+ X. c' _. s if target_all_same: # 如果测试集数据中所有数据的标签相同, 则构造叶子节点, 结束递归* R' C; y. M+ T9 |; u
node = dt_node(train_set.target[0], is_leaf=True)4 J$ r7 A- ?; N( M. i/ n. ~, n. A
if self.tree == None: # 如果根节点为空,则让该节点成为根节点1 I5 [* e: i( T4 t9 F3 K4 X
self.tree = node & e. o4 ~( H. m O% f! e, f1 M' M6 T8 `' A- p4 H. P( f4 D # 用于作图, 更新 map_str 内容, 为树图增加一个内容为标签值的叶子节点 9 h( M* d4 H8 o- [. @5 U node_content = "标签:" + str(node.target_value) 4 e) ^9 H, ?+ }0 E5 x( _4 M' ] self.map_str += "id" + str(node.id) + "[label=\"" + node_content + "\", fillcolor=\"" + self.color_dir[node.target_value] + "\", style=filled]\n"6 E r" c8 V" b* t
7 e+ i) C2 D) b$ B6 A' ?( e return node0 c) w' i% S% s) y) X$ S
elif len(train_set.feature_names) == 0: # 如果测试集待考虑属性为空, 则构造叶子节点, 结束递归# ~7 K" d+ ?1 L$ |
node = dt_node(vote_most(train_set), is_leaf=True) # 这里让叶子结点的标签为概率上最可能的标签 4 u* O4 H$ H. J0 N, C! q h3 q$ @ if self.tree == None: # 如果根节点为空,则让该节点成为根节点9 w/ C$ K+ l* N; Q
self.color_dir[vote_most(train_set)] = color_set[0] & E. T6 N3 f! S n* A a self.tree = node* D! a6 E& ~, |
4 `8 t2 }- G e. g/ W # 用于作图, 更新 map_str 内容, 为树图增加一个内容为标签值的叶子节点 6 e! Q# S# g: w4 v% ~$ H node_content = "标签:" + str(node.target_value); v6 Q# k8 M- ^3 G! Y; H( F4 M
self.map_str += "id" + str(node.id) + "[label=\"" + node_content + "\", fillcolor=\"" + self.color_dir[node.target_value] + "\", style=filled]\n"* F4 {% K# x, x( A% U% ?: ]
3 s; U2 |* L3 ]2 G) q( F! v2 h: R
return node9 ? j' x5 W& D* ]# d$ ]
else: # 普通情况, 构建一个内容为属性的非叶子节点: v) h6 O- O$ M6 x' D
best_feature = best_spilt(train_set) # 寻找最优划分属性, 作为该结点的值: f) x+ U* r$ n4 m
best_feature_index = -1 1 Y9 ^' G) i! v, ^& u4 b( \ for i in range(len(train_set.feature_names)): ( c3 C( g# _* s1 G9 D+ ]+ ~ if train_set.feature_names == best_feature: " H6 c! B: V5 x+ x# N% y best_feature_index = i $ q5 f3 P' l$ j break 8 g: ]" `1 y; g6 u! l0 a. ? r, u2 a- M' }+ u. X/ ]2 t
node = dt_node(best_feature)5 I2 o Z0 `9 S$ D2 g
node.vote_most = vote_most(train_set) 6 t0 p$ n/ r4 g* h6 [0 L" M if self.tree == None: # 如果根节点为空,则让该节点成为根节点# e& {4 Z9 j7 _: n" y! Z7 B. m
self.tree = node # e1 @; f0 u- e0 \: j! } # 用于作图, 初始化叶子节点可选颜色; n4 ~- K& n1 q
for i in range(len(train_set.target)): + ~8 n3 `+ u7 } if train_set.target not in self.color_dir: 3 ] U4 L: A v global color_i # }" m/ }, P4 u+ _9 i7 o A self.color_dir[train_set.target] = color_set[color_i]% B9 a/ }- k8 E% v7 y5 P- ~5 p4 Q
color_i += 1( Q) S! q6 L" P# C
color_i %= len(color_set)4 n0 u% r' h# X7 Y
. b: `% z* o+ N. K( [" J- W feature_range = [] # 获取该属性出现在数据集中的可选属性值 ( s. ^" H5 k2 R. i2 ?, l for t in train_set.data:8 _8 w D" Y6 U* T
if t[best_feature_index] not in feature_range:; K7 N6 r2 { j/ {
feature_range.append(t[best_feature_index])1 M4 ?* ^ ?+ a9 F/ c1 a