- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
 - #!/usr/bin/python
# -*- coding: utf-8 -*-
"""Scrape paginated article listings into a local SQLite database.

For every listing page ``http://xxx/index_<n>.html`` the script collects, from
each ``<div class="loop">`` entry, the article link (``h2 > a``) and the third
``<span>`` inside ``.content_infor``, and stores one row (Url, Title, Author)
per article in the ``Python`` table of ``Python.db``. Articles whose URL is
already stored are skipped, so the script can be re-run safely.
"""
import sqlite3
from contextlib import closing
from re import escape  # noqa: F401  (kept from the original script; no longer used)

import requests
from bs4 import BeautifulSoup

BASE_URL = "http://xxx"
DB_PATH = "Python.db"
FIRST_PAGE = 1
LAST_PAGE = 1045
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server hangs the run forever
AUTHOR_PREFIX = "xxx: "
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36"
}


def main():
    """Crawl every listing page and insert the articles not yet stored."""
    # closing() guarantees the connection is released even if a request or
    # a parse step raises part-way through the crawl.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        cur = conn.cursor()
        cur.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
        )''')
        conn.commit()

        for page in range(FIRST_PAGE, LAST_PAGE + 1):
            url = "%s/index_%s.html" % (BASE_URL, page)
            resp = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            resp.encoding = "utf-8"
            html = BeautifulSoup(resp.text, "lxml")

            for entry in html.find_all('div', class_='loop'):
                link = entry.select_one('h2 > a')
                info = entry.select_one('.content_infor > span:nth-child(3)')
                # Skip malformed entries instead of aborting the whole crawl.
                if link is None or info is None or link.get('href') is None:
                    continue

                article_url = BASE_URL + link.get('href')

                # Values come from scraped (untrusted) HTML, so they are always
                # bound through "?" placeholders rather than formatted into SQL.
                cur.execute(
                    "SELECT 1 FROM Python WHERE Url = ? LIMIT 1", (article_url,))
                if cur.fetchone() is not None:
                    continue  # already stored on a previous run/page

                # The title is stored verbatim: with bound parameters no manual
                # quote doubling or re.escape() is needed (re.escape would only
                # pollute the stored text with regex backslashes).
                cur.execute(
                    "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
                    (
                        article_url,
                        link.get('title'),
                        info.text.replace(AUTHOR_PREFIX, ''),
                    ),
                )

            # Commit once per page so an interrupted run keeps finished pages.
            conn.commit()
            print("第%s页" % str(page))


if __name__ == '__main__':
    main()
Python交流群:1047602540
|
zan
|