- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
 - #!/usr/bin/python
# -*- coding: utf-8 -*-
"""Scrape paginated article listings into a local SQLite database.

For every listing page the script extracts each entry's link, title and
author and inserts a row into the ``Python`` table of ``Python.db``.
Entries whose URL is already stored are skipped, so the script can be
re-run after an interruption without creating duplicates.
"""
import sqlite3
from re import escape  # unused now that queries are parameterized; import kept

import requests
from bs4 import BeautifulSoup

DB_PATH = 'Python.db'
BASE_URL = "http://xxx"
PAGE_URL_TEMPLATE = "http://xxx/index_%s.html"
FIRST_PAGE = 1
LAST_PAGE = 1045  # inclusive; the original loop was range(1, 1046)
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server hangs the run
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36"
}


def _create_table(conn):
    """Create the ``Python`` table if it does not exist yet and commit."""
    conn.execute('''CREATE TABLE IF NOT EXISTS Python (
        Url VARCHAR,
        Title VARCHAR,
        Author VARCHAR
    )''')
    conn.commit()


def _url_exists(cursor, url):
    """Return True when a row with this ``Url`` is already stored."""
    cursor.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (url,))
    return cursor.fetchone()[0] > 0


def _store_page(cursor, page):
    """Insert every not-yet-stored entry found on one parsed listing page.

    ``page`` is the BeautifulSoup document of a listing page. Each
    ``div.loop`` element is treated as one entry; an entry without the
    expected link or author element raises ``IndexError``.
    """
    for entry in page.find_all('div', class_='loop'):
        link = entry.select('h2 > a')[0]
        author_node = entry.select('.content_infor > span:nth-child(3)')[0]

        url = BASE_URL + link.get('href')
        if _url_exists(cursor, url):
            continue

        # Placeholders let sqlite3 do the quoting, so quotes in scraped
        # text can neither break the statement nor inject SQL. The title is
        # stored verbatim: re.escape is a regex helper, not SQL escaping,
        # and only added stray backslashes to the stored text.
        cursor.execute(
            "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
            (url, link.get('title'), author_node.text.replace('xxx: ', '')))


def main():
    """Walk all listing pages and persist their entries, one commit per page."""
    conn = sqlite3.connect(DB_PATH)
    try:
        _create_table(conn)
        cursor = conn.cursor()

        for page_no in range(FIRST_PAGE, LAST_PAGE + 1):
            url = PAGE_URL_TEMPLATE % str(page_no)
            response = requests.get(url=url, headers=HEADERS,
                                    timeout=REQUEST_TIMEOUT)
            response.encoding = "utf-8"
            page = BeautifulSoup(response.text, "lxml")

            _store_page(cursor, page)

            # Commit after each page so an interrupted run keeps its progress.
            conn.commit()
            print("第%s页" % str(page_no))
    finally:
        # Release the database file even when a request or parse step fails.
        conn.close()


if __name__ == '__main__':
    main()
Python交流群:1047602540
 |
zan
|