- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
 - #!/usr/bin/python
- # -*- coding: utf-8 -*-
- import requests
- from lxml import etree
- import sqlite3
-
def write_sql(c, text):
    """Parse one listing page and store its not-yet-seen entries.

    Every ``<a target="_blank">`` under ``<ul class="news">`` contributes a
    title (``<p>`` text), a link (``href``) and a date (``<em>`` text). Links
    are prefixed with the module-level ``host`` and inserted into the
    ``Python`` table unless a row with the same ``Url`` already exists.

    Args:
        c: sqlite3 cursor used for the duplicate check and the inserts.
        text: HTML source of the listing page.

    Returns:
        True if at least one new row was inserted, False otherwise.
    """
    html = etree.HTML(text)
    anchors = '//ul[@class="news"]//a[@target="_blank"]'
    # Title
    titles = html.xpath(anchors + '/p/text()')
    # Link
    hrefs = html.xpath(anchors + '/@href')
    # Date
    ems = html.xpath(anchors + '/em/text()')

    number = 0
    for title, href, em in zip(titles, hrefs, ems):
        href = host + href
        # Bound parameters instead of string formatting: the values come from
        # a scraped page, so a quote in any of them must not alter the SQL.
        c.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (href,))
        # Skip links that are already stored
        if c.fetchone()[0] > 0:
            continue

        # The scraped date is what ends up in the Author column.
        # str() hands sqlite3 plain strings regardless of the exact
        # string-like type the xpath call returned.
        c.execute(
            "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
            (href, str(title), str(em)),
        )
        number += 1
        print(title, href, em)

    # Commit through the cursor's own connection rather than a global.
    c.connection.commit()
    return number > 0
- 5 g& I* N* x6 v/ o% m0 J6 ]5 V( {& I\" u9 I( x$ d
if __name__ == '__main__':
    # Crawl every "/cate/..." category linked from the site's navigation bar
    # and store the listed entries in a local SQLite database. For each
    # category, pages list_1.html, list_2.html, ... are fetched until
    # write_sql() reports a page that added no new rows.
    conn = sqlite3.connect("Python-xxx.db")
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
        )''')
        conn.commit()

        host = "https://xxx"
        url = host + "/xxx"
        req = requests.get(url)
        req.encoding = 'utf-8'

        html = etree.HTML(req.text)
        nav_links = '//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]'
        clearfixs = html.xpath(nav_links + '/text()')
        hrefs = html.xpath(nav_links + '/@href')

        for clearfix, href in zip(clearfixs, hrefs):
            print(clearfix, host + href)

            page = 1
            while True:
                url = host + href + "/list_%s.html" % page
                req = requests.get(url)
                req.encoding = 'utf-8'

                # A page that inserted nothing ends this category.
                if not write_sql(c, req.text):
                    break

                print("第%s页" % page)
                page += 1
    finally:
        # Close the database even when a request or parse step raises.
        conn.close()
xpath用着就是舒服~

Python交流群:1047602540


|
zan
|