- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
 - #!/usr/bin/python
# -*- coding: utf-8 -*-
"""Crawl paginated listing pages and record each listed entry in SQLite.

For every listing page the script reads each ``div.loop`` entry, takes the
link and title from its ``h2 > a`` element and the author text from the third
``span`` inside ``.content_infor``, and stores entries whose URL is not
already present in the ``Python`` table of ``Python.db``.
"""
import sqlite3
from re import escape  # retained from the original script; not needed for SQL

import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    conn = sqlite3.connect('Python.db')
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
        )''')
        conn.commit()

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36"
        }

        for i in range(1, 1046):
            url = "http://xxx/index_%s.html" % i
            # A timeout keeps one stalled connection from hanging the crawl.
            req = requests.get(url=url, headers=headers, timeout=30)
            req.encoding = "utf-8"
            html = BeautifulSoup(req.text, "lxml")

            for div in html.find_all('div', class_='loop'):
                content_body = div.select('h2 > a')[0]
                content_infor = div.select('.content_infor > span:nth-child(3)')[0]
                article_url = "http://xxx" + content_body.get('href')

                # Skip entries that are already stored. Values are bound with
                # "?" placeholders so scraped text can never alter the SQL.
                c.execute("SELECT COUNT(*) FROM Python WHERE Url = ?",
                          (article_url,))
                if c.fetchone()[0] > 0:
                    continue

                # The title is stored exactly as scraped: parameter binding
                # makes manual quote doubling / regex escaping unnecessary
                # (and the latter corrupted the stored text).
                c.execute(
                    "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
                    (
                        article_url,
                        content_body.get('title'),
                        content_infor.text.replace('xxx: ', ''),
                    ),
                )

            # Commit once per page so an interrupted run keeps finished pages.
            conn.commit()
            print("第%s页" % i)
    finally:
        # Release the database handle even if a request or parse step fails.
        conn.close()


Python交流群:1047602540
|
zan
|