# Compute perplexity for a given number of topics.
def perplexity(num_topics):
    """Train an LDA model with `num_topics` topics and return its
    log-perplexity on the module-level `corpus`.

    Also prints each topic's top-15 words for inspection.
    Relies on module-level `corpus` and `dictionary` — TODO confirm they
    are defined before this is called.
    """
    ldamodel = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=30)
    print(ldamodel.print_topics(num_topics=num_topics, num_words=15))
    # Fix: evaluate the (expensive) perplexity bound once instead of twice.
    logp = ldamodel.log_perplexity(corpus)
    print(logp)
    return logp
# Compute c_v coherence for a given number of topics.
def coherence(num_topics):
    """Train an LDA model (passes=30, random_state=1 for reproducibility)
    and return its c_v coherence computed over `data_set`.

    Also prints each topic's top-10 words.
    Relies on module-level `corpus`, `dictionary`, and `data_set`.
    """
    ldamodel = LdaModel(corpus, num_topics=num_topics, id2word=dictionary,
                        passes=30, random_state=1)
    print(ldamodel.print_topics(num_topics=num_topics, num_words=10))
    ldacm = CoherenceModel(model=ldamodel, texts=data_set,
                           dictionary=dictionary, coherence='c_v')
    # Fix: evaluate coherence once — get_coherence() recomputes on each call.
    score = ldacm.get_coherence()
    print(score)
    return score

# 4. Plot the topic-count vs. coherence curve to choose the best topic number.
x = range(1, 15)
# z = [perplexity(i) for i in x]  # use this line instead to score by perplexity
# Score every candidate topic count by c_v coherence and plot the curve.
y = [coherence(i) for i in x]
plt.plot(x, y)
plt.xlabel('主题数目')
plt.ylabel('coherence大小')
# Use a CJK-capable font and keep the minus sign rendering correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False
plt.title('主题-coherence变化情况')
plt.show()
# This yields each topic's word distribution and the coherence-curve figure.
4 `( D/ F/ ~# T. d; T
0 O, E* b8 V8 Y- U
5. 结果输出与可视化
# From the evaluation above, 5 looks like a good topic count. Retrain the
# model with num_topics=5 and output each document's most likely topic.
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim import corpora, models
import pandas as pd
import csv
' k) l5 x# s6 w2 N. U7 w% l
# Prepare data: the input file has one pre-segmented document per line,
# tokens separated by whitespace.
PATH = "E:/data/output1.csv"

file_object2 = open(PATH, encoding='utf-8', errors='ignore').read().split('\n')
data_set = []  # one token list per document
for i in range(len(file_object2)):
    result = []
    # BUG FIX: the original called .split() on the whole list of lines
    # (AttributeError on list); split the i-th line instead.
    seg_list = file_object2[i].split()
    for w in seg_list:  # collect the tokens of this line
        result.append(w)
    data_set.append(result)

dictionary = corpora.Dictionary(data_set)  # build the vocabulary
corpus = [dictionary.doc2bow(text) for text in data_set]  # bag-of-words corpus
4 T4 \0 y, r- v; g. J
# Retrain with the chosen number of topics (5) and show the learned topics.
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5,
               passes=30, random_state=1)
topic_list = lda.print_topics()
print(topic_list)
% W8 X5 Y, ^* X) A( d
# For every document, print the id of its highest-probability topic.
for doc_topics in lda.get_document_topics(corpus)[:]:
    probs = [prob for _, prob in doc_topics]
    best = probs.index(max(probs))
    print(doc_topics[best][0])
! m9 c5 w& S& P0 {& R
同时我们可以用pyLDAvis对LDA模型结果进行可视化:
) E2 V/ P* P0 B, R j/ c timport pyLDAvis.gensim) \4 H" ?- ~# v2 q' P. b
pyLDAvis.enable_notebook() 2 c5 a k7 f" j: ^% @data = pyLDAvis.gensim.prepare(lda, corpus, dictionary) ( i J y" p$ d) |/ u9 j7 wpyLDAvis.save_html(data, 'E:/data/3topic.html') 7 D* L- e5 W B+ M' y 大概能得到这样的结果:9 T2 `8 t! m3 c0 N
+ c) r2 A% F$ ~6 |4 K( d! I; E
左侧圆圈表示主题,右侧表示各个词语对主题的贡献度。
1 ] y1 D3 |: ]% o p* K9 W( C4 s' w5 d
所有代码如下:
import warnings

import gensim
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

warnings.filterwarnings('ignore')  # suppress warnings to keep output readable
4 h' d5 m7 A' o2 _$ C' d
+ f) O$ \, }4 p
0 o( q, i! ]+ h% L
# Prepare data: one pre-segmented document per line, tokens separated by
# whitespace.
PATH = "E:/data/output.csv"

file_object2 = open(PATH, encoding='utf-8', errors='ignore').read().split('\n')
data_set = []  # one token list per document
for i in range(len(file_object2)):
    result = []
    # BUG FIX: original called .split() on the list of lines (AttributeError);
    # split the i-th line instead.
    seg_list = file_object2[i].split()
    for w in seg_list:
        result.append(w)
    data_set.append(result)
print(data_set)

dictionary = corpora.Dictionary(data_set)  # build the document-term dictionary
corpus = [dictionary.doc2bow(text) for text in data_set]
# Lda = gensim.models.ldamodel.LdaModel  # (unused) alias for the LDA class
+ F2 u- M* T! c1 C6 w
# Compute perplexity for a given number of topics.
def perplexity(num_topics):
    """Train an LDA model with `num_topics` topics and return its
    log-perplexity on the module-level `corpus`.

    Also prints each topic's top-15 words.
    Relies on module-level `corpus` and `dictionary`.
    """
    ldamodel = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=30)
    print(ldamodel.print_topics(num_topics=num_topics, num_words=15))
    # Fix: evaluate the (expensive) perplexity bound once instead of twice.
    logp = ldamodel.log_perplexity(corpus)
    print(logp)
    return logp
4 Q: ?3 z \5 o9 D
# Compute c_v coherence for a given number of topics.
def coherence(num_topics):
    """Train an LDA model (passes=30, random_state=1 for reproducibility)
    and return its c_v coherence computed over `data_set`.

    Also prints each topic's top-10 words.
    Relies on module-level `corpus`, `dictionary`, and `data_set`.
    """
    ldamodel = LdaModel(corpus, num_topics=num_topics, id2word=dictionary,
                        passes=30, random_state=1)
    print(ldamodel.print_topics(num_topics=num_topics, num_words=10))
    ldacm = CoherenceModel(model=ldamodel, texts=data_set,
                           dictionary=dictionary, coherence='c_v')
    # Fix: evaluate coherence once — get_coherence() recomputes on each call.
    score = ldacm.get_coherence()
    print(score)
    return score
# Plot coherence vs. topic count to pick the best number of topics.
# (NOTE: despite the original "perplexity" comment, this plots coherence.)
x = range(1, 15)
# z = [perplexity(i) for i in x]  # alternative: score by perplexity instead
y = [coherence(i) for i in x]
plt.plot(x, y)
plt.xlabel('主题数目')
plt.ylabel('coherence大小')
# CJK-capable font so the Chinese labels render; keep minus signs intact.
plt.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False
plt.title('主题-coherence变化情况')
plt.show()
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim import corpora, models
import pandas as pd
import csv

# Prepare data
# Read the second input file: one pre-segmented document per line.
PATH = "E:/data/output1.csv"

file_object2 = open(PATH, encoding='utf-8', errors='ignore').read().split('\n')
data_set = []  # one token list per document
for i in range(len(file_object2)):
    result = []
    # BUG FIX: original called .split() on the list of lines (AttributeError);
    # split the i-th line instead.
    seg_list = file_object2[i].split()
    for w in seg_list:
        result.append(w)
    data_set.append(result)

dictionary = corpora.Dictionary(data_set)  # build the document-term dictionary
corpus = [dictionary.doc2bow(text) for text in data_set]
# Train the final 5-topic model and show the learned topics.
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5,
               passes=30, random_state=1)
topic_list = lda.print_topics()
print(topic_list)

# Collect each document's most likely topic id into result_list.
result_list = []
for doc_topics in lda.get_document_topics(corpus)[:]:
    probs = [prob for _, prob in doc_topics]
    best = probs.index(max(probs))
    result_list.append(doc_topics[best][0])
print(result_list)
2 @( [' T, Y5 _* t
import pyLDAvis.gensim

# Render the interactive pyLDAvis view inline and save it as HTML.
pyLDAvis.enable_notebook()
data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.save_html(data, 'E:/data/topic.html')
有需要自取~
————————————————
版权声明:本文为CSDN博主「阿丢是丢心心」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/weixin_41168304/article/details/122389948
) g* H# D6 w6 O, W8 g
6 t) K$ F L5 b( l" i