- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
 - #!/usr/bin/python4 Q$ g' l* O( z
- # -*- coding: utf-8 -*-! z' ` q+ F! J) y6 H6 y\" ^. D7 F
- V4 c\" L. d5 [8 b1 l! {% K% x' H
- import requests
- & p6 G& k) m9 q# h% D- t0 Gfrom lxml import etree8 T; X' A$ K- T- B# e7 t
- import sqlite31 B3 \$ @( h& D8 y {9 K
- ; ^+ L; ]1 Q$ S# B h\" v\" V
def write_sql(c, text):
    """Parse one listing page and store every article not yet in the table.

    Args:
        c: sqlite3 cursor on a database that already has the ``Python``
            table with columns ``Url``, ``Title`` and ``Author``.
        text: HTML source of one listing page.

    Returns:
        True if at least one new row was inserted, False otherwise.

    Note:
        Reads the module-level ``host`` to turn each scraped href into an
        absolute URL. The text of each link's ``<em>`` (marked as the date
        in the original comments) is what gets stored in ``Author``.
    """
    html = etree.HTML(text)
    if html is None:
        # Nothing parseable on the page: treat it as "no new rows".
        return False

    anchor = '//ul[@class="news"]//a[@target="_blank"]'
    titles = html.xpath(anchor + '/p/text()')   # title
    hrefs = html.xpath(anchor + '/@href')       # link
    ems = html.xpath(anchor + '/em/text()')     # date

    number = 0
    for title, href, em in zip(titles, hrefs, ems):
        href = host + href

        # Skip URLs that are already stored. Placeholders keep scraped text
        # out of the SQL statement itself (no injection, no manual quoting).
        c.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (href,))
        if c.fetchone()[0] > 0:
            continue

        # str() converts the xpath string results to plain str before binding.
        c.execute(
            "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
            (href, str(title), str(em)))
        number += 1
        print(title, href, em)

    # Commit through the cursor's own connection rather than a global.
    c.connection.commit()
    return number > 0
- 9 e* Q- y! O# l4 A; h. w3 L9 B
- - K( f- f% V7 ^; j* G3 |+ Y% a% t
if __name__ == '__main__':
    # Crawl every "/cate/" category linked from the navigation bar and store
    # the articles of each listing page until a page yields nothing new.
    #
    # `conn`, `c` and `host` are intentionally module-level names:
    # write_sql() reads `host` (and the connection) from module scope.

    # Seconds to wait on any single HTTP request; without a timeout
    # requests.get() can block indefinitely on a stalled server.
    REQUEST_TIMEOUT = 30

    conn = sqlite3.connect("Python-xxx.db")
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
        )''')
        conn.commit()

        host = "https://xxx"
        url = host + "/xxx"
        req = requests.get(url, timeout=REQUEST_TIMEOUT)
        req.encoding = 'utf-8'

        html = etree.HTML(req.text)
        nav_link = '//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]'
        clearfixs = html.xpath(nav_link + '/text()')
        hrefs = html.xpath(nav_link + '/@href')

        for clearfix, href in zip(clearfixs, hrefs):
            print(clearfix, host + href)

            page = 1
            while True:
                url = host + href + "/list_%s.html" % page
                req = requests.get(url, timeout=REQUEST_TIMEOUT)
                req.encoding = 'utf-8'

                # write_sql() returns False when the page added no new rows,
                # which is the signal to move on to the next category.
                if not write_sql(c, req.text):
                    break

                print("第%s页" % page)
                page += 1
    finally:
        # Close the database even if a request or parse step raised.
        conn.close()

xpath用着就是舒服~

Python交流群:1047602540

|
zan
|