- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
#!/usr/bin/python
# -*- coding: utf-8 -*-

import sqlite3
from contextlib import closing
from re import escape

import requests
from bs4 import BeautifulSoup

# Crawl settings. The values are the ones the original flat script hard-coded.
DB_PATH = 'Python.db'
BASE_URL = "http://xxx"
FIRST_PAGE = 1
LAST_PAGE = 1045
REQUEST_TIMEOUT = 30  # seconds; without one a stalled server hangs the crawl
REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36"
}


def ensure_table(conn):
    """Create the ``Python`` table (Url, Title, Author) if it is missing."""
    conn.execute('''CREATE TABLE IF NOT EXISTS Python (
        Url VARCHAR,
        Title VARCHAR,
        Author VARCHAR
    )''')
    conn.commit()


def post_exists(cur, url):
    """Return True when a row with this ``Url`` is already stored."""
    # Bound parameter instead of string formatting: a quote inside the
    # scraped href can neither break the statement nor inject SQL.
    cur.execute("SELECT 1 FROM Python WHERE Url = ? LIMIT 1", (url,))
    return cur.fetchone() is not None


def save_post(cur, url, title, author):
    """Insert one post row; the caller is responsible for committing."""
    # The values are stored verbatim. The original ran the title through
    # re.escape() and doubled its quotes as ad-hoc SQL escaping, which wrote
    # backslash-littered titles into the table; placeholders make both
    # steps unnecessary.
    cur.execute(
        "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
        (url, title, author),
    )


def iter_posts(html):
    """Yield ``(url, title, author)`` for every ``div.loop`` entry in *html*.

    *html* is a parsed BeautifulSoup document. Entries that lack the title
    link, its ``href``, or the third ``.content_infor`` span are skipped
    rather than aborting the whole crawl with an IndexError/TypeError.
    """
    for entry in html.find_all('div', class_='loop'):
        link = entry.select_one('h2 > a')
        info = entry.select_one('.content_infor > span:nth-child(3)')
        if link is None or info is None or not link.get('href'):
            continue
        yield (
            BASE_URL + link.get('href'),
            link.get('title'),
            info.text.replace('xxx: ', ''),
        )


def crawl(conn, first_page=FIRST_PAGE, last_page=LAST_PAGE):
    """Fetch index pages ``first_page..last_page`` and store unseen posts.

    Each page is committed on its own, so an interrupted run keeps the
    pages that were already processed.
    """
    cur = conn.cursor()
    for page in range(first_page, last_page + 1):
        url = "%s/index_%s.html" % (BASE_URL, page)
        resp = requests.get(url=url, headers=REQUEST_HEADERS,
                            timeout=REQUEST_TIMEOUT)
        resp.encoding = "utf-8"
        html = BeautifulSoup(resp.text, "lxml")

        for post_url, title, author in iter_posts(html):
            if post_exists(cur, post_url):
                continue
            save_post(cur, post_url, title, author)

        conn.commit()
        print("第%s页" % page)


def main():
    """Open the database, make sure the table exists, and run the crawl."""
    # closing() guarantees conn.close() even when a request or query raises;
    # the original only closed the connection on the success path.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        ensure_table(conn)
        crawl(conn)


if __name__ == '__main__':
    main()
! s3 A* ~2 X: L+ k! b# h
Python交流群:1047602540
+ u0 ~* }' `1 h6 U( i2 [0 l6 b |
zan
|