- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
 - #!/usr/bin/python3 O; U( S1 ~: U; T! [) Y0 h. z, Y
- # -*- coding: utf-8 -*-) x( [1 q\" p) C
- ) \* ?' @ Z* m) s/ fimport requests+ C9 E. r) T, [3 w/ }$ Z6 X' o1 K6 `5 |
- from lxml import etree
- : b* Z9 }# V# H* u\" N' r; M+ `/ \import sqlite3
- v% [4 F& B. l* C/ d
- 7 ^7 I3 ^ T* X/ C: f8 a' l, e5 ]
def write_sql(c, text):
    """Parse one listing page and store every article not yet in the database.

    Args:
        c: sqlite3 cursor on a database that already contains the ``Python``
            table (columns ``Url``, ``Title``, ``Author``).
        text: HTML source of a listing page.

    Returns:
        True if at least one new row was inserted, False otherwise. The
        caller treats False as the signal to stop paginating.

    Reads the module-level ``host`` (site root prepended to every relative
    link). Commits on the connection that owns ``c`` before returning.
    """
    html = etree.HTML(text)
    # Defensive: nothing to extract if the parser produced no tree.
    if html is None:
        return False

    anchor = '//ul[@class="news"]//a[@target="_blank"]'
    titles = html.xpath(anchor + '/p/text()')  # article titles
    hrefs = html.xpath(anchor + '/@href')      # relative links
    ems = html.xpath(anchor + '/em/text()')    # dates

    # NOTE(review): the three lists are paired positionally; an anchor that
    # lacks a <p> or <em> child would shift the pairing -- confirm the page
    # markup always provides both.
    number = 0
    for title, href, em in zip(titles, hrefs, ems):
        href = host + href

        # Bound parameters instead of string formatting: scraped text may
        # contain quotes that would break (or inject into) the statement.
        c.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (href,))
        # Skip articles whose URL is already stored.
        if c.fetchone()[0] > 0:
            continue

        # str() normalises the XPath results to plain strings before binding.
        # The date is stored in the ``Author`` column, as the schema has no
        # dedicated date column.
        c.execute(
            "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
            (href, str(title), str(em)),
        )
        number += 1
        print(title, href, em)

    # Commit through the cursor's own connection rather than a global.
    c.connection.commit()
    return number > 0
- $ L# ]% M\" X8 {% n& R0 z' ^8 G' b8 C ^! P
if __name__ == '__main__':
    # Crawl every "/cate/" category linked from the site navigation and store
    # the articles of each listing page in a local SQLite database.

    # Seconds to wait on the server; without a timeout a stalled connection
    # would block the crawl forever.
    REQUEST_TIMEOUT = 30

    conn = sqlite3.connect("Python-xxx.db")
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
        )''')
        conn.commit()

        # `host` is also read by write_sql() as a module-level name.
        host = "https://xxx"
        url = host + "/xxx"
        req = requests.get(url, timeout=REQUEST_TIMEOUT)
        req.encoding = 'utf-8'

        html = etree.HTML(req.text)
        # Category names and their relative links from the navigation bar.
        clearfixs = html.xpath('//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]/text()')
        hrefs = html.xpath('//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]/@href')

        for clearfix, href in zip(clearfixs, hrefs):
            print(clearfix, host + href)

            # Walk the category's pages until one yields no new rows.
            page = 1
            while True:
                url = host + href + "/list_%s.html" % page
                # No raise_for_status() here on purpose: a page past the end
                # of the listing yields no rows, which is what stops the loop.
                req = requests.get(url, timeout=REQUEST_TIMEOUT)
                req.encoding = 'utf-8'

                if not write_sql(c, req.text):
                    break

                print("第%s页" % page)
                page += 1
    finally:
        # Release the database handle even if a request or parse step raised.
        conn.close()
- / V2 n' A9 P9 b- H
xpath用着就是舒服~

Python交流群:1047602540
|
zan
|