& v5 ~* t8 a9 T) Jlabels=[labels(1:bestFeature-1) labels(bestFeature+1:length(labels))]; %删除该特征- j1 W9 H$ c( V' h0 C# g u# ~
7 u9 \9 p& T- ?. B
%形成递归,一个特征的按每个类别再往下分 " n& b2 T- u- N; Y0 {+ Gfor i=1:length(uniqueVals)/ a4 o) n9 J2 M; v. v+ }$ q) n
subLabels = labels('; " M4 K8 P9 g; w @* v/ | value = char(uniqueVals(i)); $ f$ {0 z, c9 ?% w. l subdata = splitDataset(dataset,bestFeature,value); %取出该特征值为value的所有样本,并去除该属性5 [' N+ S# ~6 S4 D6 A1 z
leaf(value) = ID3(subdata,subLabels); j2 }( k0 n1 g" K2 K7 P
myTree(char(bestFeatureLabel)) = leaf; . b' _# J' Z" s4 G6 N* [0 B+ pend ; X o& G: @2 o/ G, k" a: j 1 S: S" i3 U: x( q* U+ V% fend % L A& S- T4 \+ x7 w4 I; C9 l* L0 U3 h' F
构建决策树过程中根据基尼指数选择特征的代码: . M1 s) I( a* x; U / F& q( e% i: m& h- c5 k" D, g9 j/ X* Zfunction bestFeature=chooseFeatureGini(dataset,~). e/ d2 g" m% F) c4 v
% 选择基尼指数最小的属性特征 p1 H5 e0 a/ v1 |+ _# c( {4 Q/ k( G; ]1 w, F& u/ e8 v
%数据预处理9 g8 u" S+ U# w! K, E
[N,M]=size(dataset); %样本数量N- ?: h% E- y2 I; x
M=M-1; %特征个数M / k* n, D$ n2 C1 U- j3 yy=strcmp(dataset(:,M+1),dataset(1,M+1)); %标签y(以第一个标签为1); C) h$ f4 k' `+ C4 @' }5 O
x=dataset(:,1:M); %数据x+ j$ j: R( H0 P8 c" _
Gini_index = zeros(1,M); %创建一个数组,用于储存每个特征的信息增益0 B3 @4 W. N$ B0 x2 h$ a
%bestFeature; %最大基尼系数的特征) i7 n; d% q, l9 w, j: ?
+ V( o- M+ `6 v) k6 ?: \* W%计算基尼指数0 B4 x3 a9 s. {9 J; i
for i=1:M 2 t! m/ Z. s0 I; G/ \. Q % 计算第i种属性的基尼指数. W) m$ x, K8 D2 B6 h7 P
temp=tabulate(x(:,i)); + X5 ~3 V! d% T1 H# x1 L value=temp(:,1); %属性值3 _+ @& l9 h v# [8 {( F
count=cell2mat(temp(:,2)); %不同属性值的各自数量8 e" _- k9 }4 {+ e0 a
Kind_Num=length(value); %取值数目 + h* X, E1 r, k7 S$ L Gini=zeros(Kind_Num,1);9 m2 N1 U; S- L( ?1 t; o; z
% i属性下 j取值的基尼指数 2 Z7 K( j3 @% ^' t5 I+ }7 ^ for j=1:Kind_Num( E0 ^, k s3 d+ {4 C5 ]% I
% 在第j种取值下正例的数目 6 g1 @8 B/ a4 ` Gini(j)= getGini( y(strcmp(x(:,i),value(j))) );( I1 q) t- U+ f; [% L9 E
end: B+ N' r: O: g& j( e* F, Y9 E
Gini_index(i)=count'/N*Gini;- N% X9 I6 c$ l7 a
end! x0 u3 Y" H S2 N0 T0 m- V2 I
%随机挑选一个最小值) Z0 M7 G/ x- i* S; x. ]
min_GiniIndex=find(Gini_index==min(Gini_index));$ C {+ k1 a3 p# W f$ c, g0 ]
choose=randi(length(min_GiniIndex)); 0 j7 [( q% p/ r% @( W8 I. EbestFeature=min_GiniIndex(choose); * z& |' H! G( F2 X3 Zend / W* x; y' t, r* K! _ M8 { / D# P! I' G2 q计算基尼指数的代码如下:" d, m: D6 W% H( E$ b
- Z; f3 J* m' N- sfunction Gini = getGini(y)$ N/ Y" Y0 u- |$ x
% 计算基尼系数7 G, u, b3 z0 F% @& |/ k
% y对应的标签,为1或0,对应正例与反例; U+ a+ P6 d( a- \/ v0 A% u
%%%%%%=============================================================================== $ R* N: c* x# e7 b, w* V) q& | N=length(y); %标签长度 + q4 ?5 a+ ~8 \. M! E: S4 T& d P_T=sum(y)/N; %正例概率5 g' _3 t" Z* U' ]! M6 V
P_F=1-P_T; %正例概率/ M, ?7 {& N! a+ d0 ?8 L& c& l7 g7 m
Gini=1-P_T*P_T-P_F*P_F; %基尼系数1 {% X4 d2 ?( m$ [
%%%%%%=============================================================================== @" q3 I5 ? y/ J" T0 D7 b" ]+ lend l/ c/ T3 }% h# l- U' Y
构造决策树的过程中,划分数据集的方法:) O( z& w' @- U- C4 z6 ]/ C
9 P) X' y3 n( [% {& ~1 z4 q
function subDataset = splitDataset(dataset,axis,value) ( R; [# D# F6 I2 P( e2 f% I/ d: u%划分数据集,axis为某特征列, 取出该特征值为value的所有样本,并去除该属性 , Q) Z- N3 E& U- ]* s, j: S E& X& L/ H. K- \9 VsubDataset = {};! ~" [* m: H H r
data_size = size(dataset);- Y0 a a: O* z4 U! c* _# @" K+ y
( x7 Y# B4 E) B7 X4 h
%取 该特征列 该属性 对应的数据集0 D/ L$ B/ U, G' V* `' I( K- k
for i=1:data_size(1); `( h7 A- `. k+ O0 Y
data = dataset(i,;5 S; X' L" B7 V$ o0 J1 S
if strcmp(cellstr(data(axis)),cellstr(value)) , i% a6 V9 G: y1 M. [& l) T3 D subDataset = [subDataset;[data(1:axis-1) data(axis+1:length(data))]]; %取 该特征列 该属性 对应的数据集/ G" f* B a1 x( Z( x2 N6 u! y
end ) @. N+ W6 I: m1 y3 D$ rend * o# v5 k$ F! r" eend + f2 G+ a8 N( [. B% l5 S5 M8 `遍历决策树的方法: : m7 o$ \7 k5 \9 s- V* y5 f4 ~$ S' H9 N
function [nodeids_,nodevalue_,branchvalue_] = print_tree(tree)/ y- c! P7 {" Y: ?* |
% 层序遍历决策树,返回nodeids(节点关系),nodevalue(节点信息),branchvalue(枝干信息)1 H3 Y0 A7 o! r! ?
nodeids(1) = 0;4 t0 r& F' l; Y
nodeid = 0;. ~; j2 k; k ]8 R2 {9 B
nodevalue={};2 e! z* r) F) N% @( v& t
branchvalue={}; Z# r; T7 l, y$ \
: H$ v6 W$ I* }1 `2 E5 Wqueue = {tree} ; %形成队列,一个一个进去 ' b5 L6 V' f& ~while ~isempty(queue) * N' z/ p+ M# }2 H* B1 r1 J/ } node = queue{1}; , S y" U5 a% L( c9 C+ n! q- D4 A queue(1) = []; %在队列中除去该节点& p8 w% ^; k% r& W( V* @) R
if strcmp(cellstr(class(node)),'containers.Map') == 0 %叶节点的话(即走到底了)) m' e* N9 F9 \1 h/ M) e. x
nodeid = nodeid+1;3 b& S& l h5 N: K, t) v
nodevalue = [nodevalue,{node}];- u6 X- o# @' }. N6 W
elseif length(node.keys)==1 %节点的话 ) h- J" G0 i% s; k; R1 q! W nodevalue = [nodevalue,node.keys]; %储存该节点名3 N, h% h, t" H% f2 [# y
node_info = node(char(node.keys)); %储存该节点下的属性对应的map . y' g+ |0 T( L! J/ e, p nodeid = nodeid+1; 6 }+ u! r+ w! f& Z$ X branchvalue = [branchvalue,node_info.keys]; %每个节点下的属性 4 k& {: Y' L& n) P6 A" ~ for i=1:length(node_info.keys) & ]$ X( O' C" p Q* g6 O3 E nodeids = [nodeids,nodeid];0 ]) g# \+ Y+ h, Y' g/ Y
end * r5 s( m1 p; U2 L. q; { end" J( p: H9 |2 B, s) B1 D
( B4 g2 h. ?6 o. [
if strcmp(cellstr(class(node)),'containers.Map') , J0 z2 N& s9 n" _* S0 `' @+ ^ keys = node.keys(); x3 B3 c8 l1 F& `- e/ h
for i = 1:length(keys) % [2 x; Y1 r7 H9 n9 e3 m: x key = keys{i};) p% e: J j% p8 j0 r2 i, O/ M
queue=[queue,{node(key)}]; %队列变成该节点下面的节点, I: @$ O0 r R& k
end ' m3 q$ D4 u% \ end 6 h0 o" J7 m+ ?" Q$ inodeids_=nodeids;6 ?& m2 @2 @5 ?# M6 l
nodevalue_=nodevalue;( n% j2 c( Z) X8 d6 F
branchvalue_ = branchvalue; 5 F7 E( }) p. [. \1 \( Z% Tend3 `' ?( i' ^) G. C3 c1 P
/ I' ^0 ]5 t, J5 L
绘制决策树的方法: 6 O |) K/ {8 U, e$ o( @" L# G - T* }: {) ]% w4 v4 N9 |' d7 k" A2 B& ?& J& _3 c
function tree_plot(p,nodevalue,branchvalue) 7 ?' }+ m4 I6 i/ F3 ]0 s5 s2 N3 J% 参考treeplot+ w8 B7 e% V$ z- M8 }
) D' m6 S. W0 |, l" C P- u ?1 Y[x,y,h] = treelayout(p); %x:横坐标,y:纵坐标;h:树的深度! I, w6 B4 w, d8 Q
f = find(p~=0); %非0节点/ ~) | d8 o+ Y& y1 K
pp = p(f); %非0值1 U. m7 i$ t; N- u4 E6 S) |' Z4 l
X = [x(f); x(pp); NaN(size(f))];; @ V1 J+ q* H7 W$ S. K+ Z
Y = [y(f); y(pp); NaN(size(f))];2 e& ^& z2 e+ Y- K' n& a0 m
4 G; T6 L+ j, s. r; \% X( X2 A6 \X = X(; 9 Q. N3 i3 o8 Q* kY = Y(; ' s9 p4 s- B: m+ R, x5 j : F: j9 v7 ?- d' w* i, Zn = length(p); % G) }( z# V+ p* w8 m+ `if n<500 " z* n* Z# m* y" S! G: Y# ^ hold on; 7 D) [$ h& A# K% r0 g% p plot(x,y,'ro',X,Y,'r-'), s s, W, f9 h
nodesize = length(x);* X9 u' {' w+ Z' p. O' _9 K
for i=1:nodesize- ?$ z n& T0 T+ P# b1 A
text(x(i)+0.01,y(i),nodevalue{1,i}); " W& F9 g0 [2 j( H( ]" P5 {- c end " y" j$ G) ]0 z8 d# C for i=2:nodesize" C4 ^/ y/ C: O% K# v
j = 3*i-5;7 ~% _+ {+ L& Q1 H* k
text((X(j)+X(j+1))/2-length(char(branchvalue{1,i-1}))/200,(Y(j)+Y(j+1))/2,branchvalue{1,i-1})$ F9 [- e' I N# Y9 C8 d
end }- H' J( j; r( t/ e
hold off# m% v0 O- M: Y0 }$ R! Y3 V& J0 K1 \, l
else; C, E# l1 O( f3 T4 \$ D8 _
plot(X,Y,'r-');) X0 W& C8 o$ B+ L# U2 V
end$ g5 p, ?( Q: S- U
xlabel(['height = ' int2str(h)]); 0 V8 O0 M: @6 xaxis([0 1 0 1]); 0 f/ f( j3 A9 W# u6 m" J" Y6 vend; _' ]' Y4 p" X9 D! s- ~
) u7 J7 s2 E- D8 Q
测试集进行预测的方法:7 L% F- B/ @& T) x1 z