- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
#!/usr/bin/python
# -*- coding: utf-8 -*-

import sqlite3
from contextlib import closing
from re import escape

import requests
from bs4 import BeautifulSoup

# Scraper configuration. "http://xxx" is the placeholder host used by the
# script as published; substitute the real site before running.
DB_PATH = 'Python.db'
BASE_URL = "http://xxx"
FIRST_PAGE = 1
LAST_PAGE = 1045  # inclusive upper bound, i.e. range(1, 1046)
REQUEST_TIMEOUT = 30  # seconds; requests.get waits indefinitely without one
AUTHOR_PREFIX = 'xxx: '  # label stripped from the author <span> text
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36"
}


def init_db(conn: sqlite3.Connection) -> None:
    """Create the ``Python`` table (Url, Title, Author) if it is missing."""
    conn.execute('''CREATE TABLE IF NOT EXISTS Python (
        Url VARCHAR,
        Title VARCHAR,
        Author VARCHAR
    )''')
    conn.commit()


def save_article(conn: sqlite3.Connection, url, title, author) -> bool:
    """Insert one article row unless a row with the same *url* exists.

    Values are bound through ``?`` placeholders, so quotes, backslashes or
    any other characters in scraped text are stored verbatim and cannot
    break or alter the SQL statement.

    The caller is responsible for committing.

    Returns:
        True if a row was inserted, False if *url* was already stored.
    """
    found = conn.execute(
        "SELECT 1 FROM Python WHERE Url = ? LIMIT 1", (url,)
    ).fetchone()
    if found is not None:
        return False
    conn.execute(
        "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
        (url, title, author),
    )
    return True


def scrape_page(conn: sqlite3.Connection, page: int) -> int:
    """Fetch listing page number *page* and store every new article on it.

    Each ``div.loop`` element contributes one row: the link under ``h2 > a``
    supplies the URL (``href``) and title (``title`` attribute), and the
    third ``<span>`` inside ``.content_infor`` supplies the author.

    Commits once after the whole page has been processed.

    Returns:
        The number of rows inserted for this page.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    url = "%s/index_%s.html" % (BASE_URL, page)
    response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.encoding = "utf-8"
    html = BeautifulSoup(response.text, "lxml")

    inserted = 0
    for div in html.find_all('div', class_='loop'):
        links = div.select('h2 > a')
        # An entry without a usable link cannot be identified or
        # de-duplicated, so skip it instead of crashing the whole run.
        if not links or not links[0].get('href'):
            continue
        link = links[0]

        author_spans = div.select('.content_infor > span:nth-child(3)')
        author = ''
        if author_spans:
            author = author_spans[0].text.replace(AUTHOR_PREFIX, '')

        # A missing title attribute is stored as NULL.
        if save_article(conn, BASE_URL + link.get('href'),
                        link.get('title'), author):
            inserted += 1

    conn.commit()
    return inserted


def main() -> None:
    """Scrape every listing page into the SQLite database at ``DB_PATH``."""
    # closing() guarantees the connection is released even if a request or
    # parse step raises part-way through the run.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        init_db(conn)
        for page in range(FIRST_PAGE, LAST_PAGE + 1):
            scrape_page(conn, page)
            print("第%s页" % str(page))


if __name__ == '__main__':
    main()
Python交流群:1047602540
 |
zan
|