- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Scrape paginated article listings into a local SQLite database.

For every listing page ``http://xxx/index_<n>.html`` the script extracts
each ``div.loop`` entry (link, title, author) and stores it in the
``Python`` table of ``Python.db``, skipping URLs that are already stored.

All SQL uses ``?`` placeholders, so titles and authors containing quotes
are stored verbatim and cannot alter the statement.
"""

import sqlite3
from re import escape  # noqa: F401  (kept from the original script; no longer used)

import requests
from bs4 import BeautifulSoup

_SITE_ROOT = "http://xxx"
_DB_PATH = 'Python.db'
_FIRST_PAGE = 1
_LAST_PAGE = 1045
# Seconds to wait for the server before giving up instead of hanging forever.
_REQUEST_TIMEOUT = 30
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36"
}


def _ensure_table(conn):
    """Create the ``Python`` table if it does not exist yet."""
    conn.execute('''CREATE TABLE IF NOT EXISTS Python (
        Url VARCHAR,
        Title VARCHAR,
        Author VARCHAR
    )''')
    conn.commit()


def _fetch_listing(page):
    """Download listing page number *page* and return it parsed with lxml."""
    url = "%s/index_%s.html" % (_SITE_ROOT, page)
    response = requests.get(url=url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT)
    # The response text is decoded as UTF-8 regardless of what requests guessed.
    response.encoding = "utf-8"
    return BeautifulSoup(response.text, "lxml")


def _is_stored(cursor, url):
    """Return True when a row with this *url* is already in the table."""
    cursor.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (url,))
    return cursor.fetchone()[0] > 0


def _store_new_entries(cursor, soup):
    """Insert every ``div.loop`` entry of *soup* that is not stored yet."""
    for entry in soup.find_all('div', class_='loop'):
        link = entry.select('h2 > a')[0]
        author_span = entry.select('.content_infor > span:nth-child(3)')[0]

        article_url = _SITE_ROOT + link.get('href')
        if _is_stored(cursor, article_url):
            continue

        # Bound parameters replace the former manual quote doubling and the
        # re.escape() call, which inserted regex backslashes into titles.
        cursor.execute(
            'INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)',
            (
                article_url,
                link.get('title'),
                author_span.text.replace('xxx: ', ''),
            ),
        )


def main():
    """Crawl all listing pages, committing once per page."""
    conn = sqlite3.connect(_DB_PATH)
    try:
        _ensure_table(conn)
        cursor = conn.cursor()
        for page in range(_FIRST_PAGE, _LAST_PAGE + 1):
            _store_new_entries(cursor, _fetch_listing(page))
            conn.commit()
            print("第%s页" % str(page))
    finally:
        # Close the connection even when a request or a parse step fails.
        conn.close()


if __name__ == '__main__':
    main()
- * l1 |/ {& t+ ?) S* L2 y
Python交流群:1047602540
4 p7 i9 K2 }+ _' i |
zan
|