- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
 - #!/usr/bin/python
- 1 f2 R, }1 a, `1 D5 c# -*- coding: utf-8 -*-5 c$ i3 s3 y2 G* \5 `6 T
- - t( _( d( h0 H3 F* z, f9 R8 v
- import requests3 y5 d3 Z% o0 j4 Q
- from lxml import etree
- ( C1 o d, E; {% I% k$ Vimport sqlite3# O6 ~9 r' S/ P7 e1 Q+ K
- 9 M7 C2 V4 }- w; q D
def write_sql(c, text):
    """Store the article links found on one listing page.

    Parses ``text`` as HTML, reads the title, link and date of every
    ``<a target="_blank">`` below ``<ul class="news">`` and inserts each
    link whose URL is not already present in the ``Python`` table.

    The module-level ``host`` is prepended to every scraped href, so it
    must be defined before this function is called.

    Args:
        c: sqlite3 cursor on the database that holds the ``Python`` table.
        text: HTML source of the listing page.

    Returns:
        True if at least one new row was inserted, otherwise False.
    """
    html = etree.HTML(text)
    link_xpath = '//ul[@class="news"]//a[@target="_blank"]'
    # Titles
    titles = html.xpath(link_xpath + '/p/text()')
    # Links
    hrefs = html.xpath(link_xpath + '/@href')
    # Dates
    ems = html.xpath(link_xpath + '/em/text()')

    number = 0
    for title, href, em in zip(titles, hrefs, ems):
        href = host + href
        # Placeholders instead of string formatting: the values come from a
        # scraped page and may contain quotes that would break or inject SQL.
        c.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (href,))
        # Skip links that are already stored.
        if c.fetchone()[0] > 0:
            continue

        # The date text is written to the Author column, as before.
        c.execute(
            'INSERT INTO Python( Url, Title, Author) VALUES (?, ?, ?)',
            (href, title, em))
        number += 1
        print(title, href, em)

    # Commit through the cursor's own connection rather than a global one.
    c.connection.commit()
    return number > 0
- Z0 m l/ B |4 R+ |- u2 f1 W& H( b- x4 n$ j( g( x
if __name__ == '__main__':
    # Crawl every "/cate/" category linked from the navigation bar and store
    # each category's paginated article list in a local SQLite database.

    # Seconds to wait for the server before giving up on a request, so a
    # stalled connection cannot hang the crawl forever.
    REQUEST_TIMEOUT = 30

    # conn, c and host are deliberately left as module-level names:
    # write_sql() looks names up in module scope rather than taking them
    # all as arguments.
    conn = sqlite3.connect("Python-xxx.db")
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
        )''')
        conn.commit()

        host = "https://xxx"
        url = host + "/xxx"
        req = requests.get(url, timeout=REQUEST_TIMEOUT)
        req.encoding = 'utf-8'

        html = etree.HTML(req.text)
        nav_xpath = '//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]'
        clearfixs = html.xpath(nav_xpath + '/text()')
        hrefs = html.xpath(nav_xpath + '/@href')

        for clearfix, href in zip(clearfixs, hrefs):
            print(clearfix, host + href)
            page = 1
            while True:
                url = host + href + "/list_%s.html" % page
                req = requests.get(url, timeout=REQUEST_TIMEOUT)
                req.encoding = 'utf-8'
                # No status check here on purpose: a page that yields no new
                # rows (write_sql returns False) is what ends the paging.
                if not write_sql(c, req.text):
                    break

                print("第%s页" % page)
                page += 1
    finally:
        # Close the database even when a request or parse step raises.
        conn.close()
- 6 `0 y0 C3 Q+ A' r/ n
xpath用着就是舒服~
Python交流群:1047602540
" u# E7 t$ ]& y, A9 }% R5 w( X& i7 L: C7 l# u3 E% i5 a" G
- T+ h/ T8 A. f: {) q: d* P4 ^' z |
zan
|