- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
 - #!/usr/bin/python
- ' n8 h$ Y% _\" [# -*- coding: utf-8 -*-' Y6 A$ Z: l& m' b
- # J7 _3 s$ |, b1 c. M& Yimport requests
- . W5 t\" n4 f6 Mfrom lxml import etree
- % j8 ^+ l2 n* G4 b8 I \import sqlite3
- ' @# |+ N7 o- n6 m B F
- & G: e& }& |* T' }2 I$ E) l f, d/ n4 T! [! _( _: E9 X# {) [6 x( K' K
def write_sql(c, text):
    """Store the entries of one listing page that are not in the table yet.

    Relies on the module-level name ``host`` to turn the relative links
    found on the page into absolute URLs.

    Args:
        c: sqlite3 cursor on a database containing the ``Python`` table
            (columns ``Url``, ``Title``, ``Author``).
        text: HTML source of the listing page.

    Returns:
        True if at least one new row was inserted, False otherwise.
    """
    html = etree.HTML(text)
    # NOTE(review): guard for input the parser cannot turn into a tree;
    # such a page is treated as having no entries.
    if html is None:
        return False

    # Titles
    titles = html.xpath('//ul[@class="news"]//a[@target="_blank"]/p/text()')
    # Links
    hrefs = html.xpath('//ul[@class="news"]//a[@target="_blank"]/@href')
    # Dates
    ems = html.xpath('//ul[@class="news"]//a[@target="_blank"]/em/text()')

    number = 0
    for title, href, em in zip(titles, hrefs, ems):
        href = host + href

        # Skip entries whose URL is already stored.  Values are bound with
        # placeholders so quotes in scraped text cannot break or alter the SQL.
        c.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (href,))
        if c.fetchone()[0] > 0:
            continue

        # The <em> text (a date, per the XPath comment above) goes into the
        # Author column, as in the original schema usage.  str() normalises
        # the XPath results to plain strings before binding.
        c.execute(
            "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
            (href, str(title), str(em)),
        )
        number += 1
        print(title, href, em)

    # Commit on the connection that owns this cursor instead of depending on
    # a global connection object.
    c.connection.commit()
    return number > 0
- - e5 `: X( Z: c3 @+ R% b
if __name__ == '__main__':
    # Script entry point: create the table if needed, read the category links
    # from the navigation bar of the start page, then walk each category's
    # paginated listing and store its entries via write_sql().
    #
    # conn, c and host are deliberately module-level names here: write_sql()
    # reads host (and may read conn) as globals.

    # Seconds to wait for the server; without a timeout requests may block
    # indefinitely.
    REQUEST_TIMEOUT = 30

    conn = sqlite3.connect("Python-xxx.db")
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
        )''')
        conn.commit()

        host = "https://xxx"
        url = host + "/xxx"
        req = requests.get(url, timeout=REQUEST_TIMEOUT)
        req.encoding = 'utf-8'

        html = etree.HTML(req.text)
        # Category names and their relative links, taken from the same anchors.
        clearfixs = html.xpath('//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]/text()')
        hrefs = html.xpath('//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]/@href')

        for clearfix, href in zip(clearfixs, hrefs):
            print(clearfix, host + href)

            page = 1
            while True:
                url = host + href + "/list_%s.html" % page
                req = requests.get(url, timeout=REQUEST_TIMEOUT)
                req.encoding = 'utf-8'

                # Stop paging this category at the first page that yields no
                # new rows.
                if not write_sql(c, req.text):
                    break

                print("第%s页" % page)
                page += 1
    finally:
        # Close the database even when a request or parse step raises.
        conn.close()
7 |2 R, W' Q& D! z9 R
xpath用着就是舒服~
+ z3 }/ e1 \4 K m: [8 T. p. [2 p7 `& Y, N2 w4 Y2 W
Python交流群:1047602540
1 F; B$ `" W4 L. I0 g
0 W8 a- h& U, e/ f# W1 a- Y |
zan
|