左侧圆圈表示主题,右侧表示各个词语对主题的贡献度。

所有代码如下:
# Imports for LDA topic modelling, evaluation, and plotting.
import warnings

import gensim
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel

# Suppress library warnings (gensim/matplotlib are noisy) so the printed
# topic lists below stay readable.
warnings.filterwarnings('ignore')
# ---- Prepare the data --------------------------------------------------
# Each line of the file is one document whose tokens are separated by
# whitespace (pre-segmented Chinese text) — TODO confirm against the file.
PATH = "E:/data/output.csv"

# Read the whole file and split into lines (one document per line).
# Use a context manager so the handle is closed deterministically
# (the original leaked the open file object).
with open(PATH, encoding='utf-8', errors='ignore') as fh:
    file_object2 = fh.read().split('\n')

data_set = []  # list of token lists, one per document
for line in file_object2:
    # BUG FIX: the original called .split() on the *list* of lines
    # (file_object2.split()), which raises AttributeError; each
    # individual line must be split instead.
    result = []
    for w in line.split():  # collect the tokens of this document
        result.append(w)
    data_set.append(result)
print(data_set)
# Build the token->id dictionary and the bag-of-words corpus
# (document-term matrix) from the tokenised documents.
dictionary = corpora.Dictionary(data_set)
corpus = [dictionary.doc2bow(text) for text in data_set]
# ---- Model-selection metrics -------------------------------------------

def perplexity(num_topics):
    """Train an LDA model with ``num_topics`` topics and return its
    log-perplexity on the training corpus.

    Uses the module-level ``corpus`` and ``dictionary`` built above.
    Also prints the top 15 words of each topic and the score itself.
    """
    ldamodel = LdaModel(corpus, num_topics=num_topics,
                        id2word=dictionary, passes=30)
    print(ldamodel.print_topics(num_topics=num_topics, num_words=15))
    # Compute once and reuse — the original evaluated log_perplexity
    # twice (once for print, once for return).
    score = ldamodel.log_perplexity(corpus)
    print(score)
    return score


def coherence(num_topics):
    """Train an LDA model with ``num_topics`` topics and return its C_v
    topic-coherence score (higher = more interpretable topics).

    ``random_state=1`` pins the training for reproducible sweeps.
    """
    ldamodel = LdaModel(corpus, num_topics=num_topics,
                        id2word=dictionary, passes=30, random_state=1)
    print(ldamodel.print_topics(num_topics=num_topics, num_words=10))
    ldacm = CoherenceModel(model=ldamodel, texts=data_set,
                           dictionary=dictionary, coherence='c_v')
    # Compute once — get_coherence() retrains the coherence pipeline and
    # is expensive; the original called it twice.
    score = ldacm.get_coherence()
    print(score)
    return score
# ---- Sweep the topic count and plot the coherence curve ----------------
# (The original comment said "perplexity" but the plotted series is
# coherence; the perplexity sweep is left available below.)
x = range(1, 15)
# z = [perplexity(i) for i in x]   # optional: perplexity sweep
y = [coherence(i) for i in x]

# Configure fonts BEFORE creating any text objects so the Chinese labels
# render correctly (SimHei is a CJK-capable font); unicode_minus avoids
# a missing-glyph box for the minus sign.
plt.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False

plt.plot(x, y)
plt.xlabel('主题数目')
plt.ylabel('coherence大小')
plt.title('主题-coherence变化情况')
plt.show()

# ---- Part 2: train the final model on a second data set ----------------
from gensim.models import LdaModel
import pandas as pd
from gensim.corpora import Dictionary
from gensim import corpora, models
import csv

# ---- Prepare the second data set ---------------------------------------
# Same format as part 1: one whitespace-tokenised document per line.
PATH = "E:/data/output1.csv"
with open(PATH, encoding='utf-8', errors='ignore') as fh:
    file_object2 = fh.read().split('\n')

data_set = []  # rebuilt for the second corpus (shadows the part-1 value)
for line in file_object2:
    # BUG FIX: the original called .split() on the list of lines,
    # which raises AttributeError; split each line instead.
    result = []
    for w in line.split():
        result.append(w)
    data_set.append(result)

# Rebuild the dictionary and bag-of-words corpus for the new data.
dictionary = corpora.Dictionary(data_set)
corpus = [dictionary.doc2bow(text) for text in data_set]
# Train the final model with the topic count chosen from the coherence
# curve (5 here); random_state pins the result for reproducibility.
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5,
               passes=30, random_state=1)
topic_list = lda.print_topics()
print(topic_list)
# ---- Assign each document its dominant topic ---------------------------
result_list = []
for doc_topics in lda.get_document_topics(corpus):
    # doc_topics is a list of (topic_id, probability) pairs for one
    # document; record the id of the most probable topic.
    probs = [p for _, p in doc_topics]
    best = probs.index(max(probs))
    result_list.append(doc_topics[best][0])
print(result_list)

import pyLDAvis.gensim
# ---- Interactive visualisation -----------------------------------------
# enable_notebook() only takes effect inside a Jupyter notebook; the
# HTML export below works either way. In the rendered view the circles
# on the left are topics and the bars on the right show each word's
# contribution to the selected topic.
pyLDAvis.enable_notebook()
data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.save_html(data, 'E:/data/topic.html')

# 有需要自取~
# ------------------------------------------------------------------------
# 版权声明:本文为CSDN博主「阿丢是丢心心」的原创文章,遵循CC 4.0 BY-SA版权协议,
# 转载请附上原文出处链接及本声明。
# 原文链接:https://blog.csdn.net/weixin_41168304/article/details/122389948