- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
from lxml import etree
import sqlite3


def write_sql(c, text):
    """Store the articles of one listing page that are not in the table yet.

    The page is parsed with lxml; every link found under ``ul.news`` is
    turned into an absolute URL by prefixing the module-level ``host`` and
    inserted into the ``Python`` table unless a row with that URL exists.

    Args:
        c: sqlite3 cursor on a database that already has the ``Python``
            table (columns ``Url``, ``Title``, ``Author``).
        text: HTML source of the listing page.

    Returns:
        True if at least one new row was inserted, False otherwise. The
        caller uses a False result as the signal to stop paginating.
    """
    html = etree.HTML(text)
    # Titles
    titles = html.xpath('//ul[@class="news"]//a[@target="_blank"]/p/text()')
    # Links
    hrefs = html.xpath('//ul[@class="news"]//a[@target="_blank"]/@href')
    # Dates
    ems = html.xpath('//ul[@class="news"]//a[@target="_blank"]/em/text()')

    number = 0
    for title, href, em in zip(titles, hrefs, ems):
        href = host + href

        # Skip URLs that are already stored. Placeholders are used instead
        # of string formatting so scraped text can never alter the SQL.
        c.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (href,))
        if c.fetchone()[0] > 0:
            continue

        # The <em> text (the date, per the comment above) goes into the
        # Author column, as in the original schema usage.
        c.execute(
            "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
            (href, title, em))
        number += 1
        print(title, href, em)

    # Commit through the cursor's own connection rather than a global.
    c.connection.commit()
    return number > 0

if __name__ == '__main__':
    # Crawl every category linked from the site's navigation bar and store
    # the listed articles in a local SQLite database.
    #
    # write_sql() relies on module-level names defined in this block, so
    # they are deliberately kept at module scope rather than in a main().
    request_timeout = 30  # seconds; without it a stalled server hangs the run

    conn = sqlite3.connect("Python-xxx.db")
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
        )''')
        conn.commit()

        host = "https://xxx"
        url = host + "/xxx"
        req = requests.get(url, timeout=request_timeout)
        req.encoding = 'utf-8'

        # Category names and their relative links from the navigation bar.
        html = etree.HTML(req.text)
        clearfixs = html.xpath('//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]/text()')
        hrefs = html.xpath('//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]/@href')

        for clearfix, href in zip(clearfixs, hrefs):
            print(clearfix, host + href)

            # Walk the category's pages until one yields no new rows.
            page = 1
            while True:
                url = host + href + "/list_%s.html" % page
                req = requests.get(url, timeout=request_timeout)
                req.encoding = 'utf-8'

                if not write_sql(c, req.text):
                    break

                print("第%s页" % page)
                page += 1
    finally:
        # Close the database even when a request or parse step raises.
        conn.close()

xpath用着就是舒服~
" x' e" j* T. _% U {! l; a$ ]
Python交流群:1047602540
6 U8 l" i$ B5 {+ ~7 r2 z+ j8 g Y# g5 O; j, s" H8 o" D$ [
) Z" t; Q8 x- g" X |
zan
|