- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""Crawl the paginated article index and record each entry in SQLite.

For every index page the script collects each ``div.loop`` entry's link,
title and author text and stores them in the ``Python`` table of
``Python.db``. Entries whose URL is already in the table are skipped, so
the script can be re-run without creating duplicate rows.
"""
from re import escape  # noqa: F401 -- kept from the original listing; unused now that SQL values are bound
import sqlite3

import requests
from bs4 import BeautifulSoup

DB_PATH = 'Python.db'
SITE_ROOT = "http://xxx"
FIRST_PAGE = 1
LAST_PAGE = 1045
# Without a timeout a single stalled connection would block the crawl forever.
REQUEST_TIMEOUT = 30  # seconds
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36"
}


def _ensure_table(conn):
    """Create the ``Python`` table (Url, Title, Author) if it is missing."""
    conn.execute('''CREATE TABLE IF NOT EXISTS Python (
        Url VARCHAR,
        Title VARCHAR,
        Author VARCHAR
    )''')
    conn.commit()


def _already_stored(cursor, url):
    """Return True when a row with this ``Url`` is already in the table."""
    # Bound parameter: the URL comes from scraped HTML and may contain quotes.
    cursor.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (url,))
    return cursor.fetchone()[0] > 0


def _scrape_page(cursor, page_number):
    """Fetch one index page and insert every entry that is not stored yet."""
    url = "http://xxx/index_%s.html" % str(page_number)
    response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.encoding = "utf-8"
    html = BeautifulSoup(response.text, "lxml")

    for div in html.find_all('div', class_='loop'):
        content_body = div.select('h2 > a')[0]
        content_infor = div.select('.content_infor > span:nth-child(3)')[0]

        entry_url = SITE_ROOT + content_body.get('href')
        if _already_stored(cursor, entry_url):
            continue

        # Values are bound rather than interpolated into the SQL text, so the
        # title is stored exactly as scraped: no manual quote doubling and no
        # regex-style backslashes from re.escape().
        cursor.execute(
            'INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)',
            (
                entry_url,
                content_body.get('title'),
                content_infor.text.replace('xxx: ', ''),
            ),
        )


def main():
    """Run the crawl over every index page, committing once per page."""
    conn = sqlite3.connect(DB_PATH)
    try:
        _ensure_table(conn)
        c = conn.cursor()
        for i in range(FIRST_PAGE, LAST_PAGE + 1):
            _scrape_page(c, i)
            conn.commit()
            print("第%s页" % str(i))
    finally:
        # Release the database file even when a request or parse step raises.
        conn.close()


if __name__ == '__main__':
    main()

Python交流群:1047602540
|
zan
|