- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
 - #!/usr/bin/python
# -*- coding: utf-8 -*-
"""Crawl paginated article index pages and record each article in SQLite.

For every index page the script fetches the HTML, walks each
``<div class="loop">`` entry, and stores the article URL, title and author
in the ``Python`` table of ``Python.db``.  Articles whose URL is already in
the table are skipped, so the script can be re-run safely.
"""
import sqlite3
from re import escape  # kept for compatibility; values are now bound as SQL parameters

import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    conn = sqlite3.connect('Python.db')
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
        )''')
        conn.commit()

        # --------------------Split Line--------------------
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36"
        }
        # Without a timeout a single stalled connection blocks the crawl forever.
        request_timeout = 30

        for i in range(1, 1046):
            url = "http://xxx/index_%s.html" % str(i)
            req = requests.get(url=url, headers=headers, timeout=request_timeout)
            req.encoding = "utf-8"
            html = BeautifulSoup(req.text, "lxml")

            # --------------------Split Line--------------------
            for div in html.find_all('div', class_='loop'):
                content_body = div.select('h2 > a')[0]
                content_infor = div.select('.content_infor > span:nth-child(3)')[0]

                article_url = "http://xxx" + content_body.get('href')

                # --------------------Split Line--------------------
                # Skip articles that are already stored.  The URL is bound as a
                # parameter so quotes in scraped text cannot break or inject SQL.
                c.execute("SELECT 1 FROM Python WHERE Url = ? LIMIT 1", (article_url,))
                if c.fetchone() is not None:
                    continue

                # --------------------Split Line--------------------
                # Values are stored verbatim: with placeholders no manual quote
                # doubling or escaping is needed (re.escape is a regex helper and
                # previously added stray backslashes to the saved titles).
                c.execute(
                    'INSERT INTO Python( Url, Title, Author) VALUES ( ?, ?, ?)',
                    (
                        article_url,
                        content_body.get('title'),
                        content_infor.text.replace('xxx: ', ''),
                    ),
                )

            # Commit once per index page so progress survives an interruption.
            conn.commit()
            print("第%s页" % str(i))

        # --------------------Split Line--------------------
    finally:
        # Always release the database handle, even if a request or parse fails.
        conn.close()
Python交流群:1047602540
 |
zan
|