- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
 - #!/usr/bin/python
- # -*- coding: utf-8 -*-
-
- import requests
- from lxml import etree
- import sqlite3
-
-
def write_sql(c, text):
    """Store the not-yet-saved entries of one listing page in the ``Python`` table.

    Args:
        c: sqlite3 cursor on a database that already contains the ``Python``
            table (columns ``Url``, ``Title``, ``Author``).
        text: HTML source of a listing page.

    Returns:
        True if at least one new row was inserted, False otherwise.

    Note:
        Relative links are resolved against the module-level ``host``.
    """
    html = etree.HTML(text)
    anchor = '//ul[@class="news"]//a[@target="_blank"]'
    # Title
    titles = html.xpath(anchor + '/p/text()')
    # Link
    hrefs = html.xpath(anchor + '/@href')
    # Date (stored in the ``Author`` column below)
    ems = html.xpath(anchor + '/em/text()')

    number = 0
    for title, href, em in zip(titles, hrefs, ems):
        href = host + href
        # Bound parameters instead of string formatting: scraped text may
        # contain quotes, which would break (or inject into) a formatted query.
        c.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (href,))
        # Skip entries whose URL is already stored.
        if c.fetchone()[0] > 0:
            continue

        # str() turns lxml's str-subclass results into plain strings for binding.
        c.execute(
            "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
            (href, str(title), str(em)))
        number += 1
        print(title, href, em)

    # Commit through the cursor's own connection rather than a global one.
    c.connection.commit()
    return number > 0
- 7 I L\" C1 X2 T' r: X: u\" X$ p
- # M* K4 r9 A& p$ y
if __name__ == '__main__':
    # Everything assigned here is a module global; write_sql() relies on
    # ``host`` being one, so this block is intentionally not wrapped in a
    # function.
    REQUEST_TIMEOUT = 10  # seconds; without it a stalled server blocks forever

    conn = sqlite3.connect("Python-xxx.db")
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
        )''')
        conn.commit()

        host = "https://xxx"
        url = host + "/xxx"
        req = requests.get(url, timeout=REQUEST_TIMEOUT)
        req.encoding = 'utf-8'

        # Collect the category links (href starting with "/cate/") from the
        # navigation bar, together with their link text.
        html = etree.HTML(req.text)
        nav_link = '//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]'
        clearfixs = html.xpath(nav_link + '/text()')
        hrefs = html.xpath(nav_link + '/@href')

        for clearfix, href in zip(clearfixs, hrefs):
            print(clearfix, host + href)

            # Walk the category's pages until one yields no new rows.
            page = 1
            while True:
                url = host + href + "/list_%s.html" % page
                req = requests.get(url, timeout=REQUEST_TIMEOUT)
                req.encoding = 'utf-8'
                if not write_sql(c, req.text):
                    break

                print("第%s页" % page)
                page += 1
    finally:
        # Close the database even when a request or parse step raises.
        conn.close()
- 5 d3 M& W7 _7 f\" a7 R, F0 ^6 J) K

xpath用着就是舒服~
Python交流群:1047602540


 |
zan
|