- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
 - #!/usr/bin/python
# -*- coding: utf-8 -*-
"""Crawl paginated article listings into a local SQLite database.

Every listing page ``http://xxx/index_<n>.html`` (n = 1 .. 1045) is fetched
and parsed.  For each ``<div class="loop">`` entry the article link, its
``title`` attribute and the author text are extracted, and entries whose URL
is not yet present in the ``Python`` table of ``Python.db`` are inserted.

NOTE(review): the host is literally ``xxx`` in the original script --
presumably redacted; replace BASE_URL with the real site before running.
"""
import sqlite3
from re import escape  # noqa: F401  (unused now; kept so the import list is unchanged)

import requests
from bs4 import BeautifulSoup

BASE_URL = "http://xxx"
FIRST_PAGE = 1
LAST_PAGE = 1045
# Seconds to wait for the server; without a timeout a stalled connection
# would block the crawl forever.
REQUEST_TIMEOUT = 10

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36",
}

CREATE_TABLE_SQL = '''CREATE TABLE IF NOT EXISTS Python (
    Url VARCHAR,
    Title VARCHAR,
    Author VARCHAR
)'''


def _parse_entries(html):
    """Yield ``(url, title, author)`` for every complete entry on a page.

    Entries that lack the ``h2 > a`` link, its ``href`` or the author span
    are skipped instead of aborting the crawl with an IndexError/TypeError.
    """
    for div in html.find_all('div', class_='loop'):
        links = div.select('h2 > a')
        infos = div.select('.content_infor > span:nth-child(3)')
        if not links or not infos:
            continue
        link = links[0]
        href = link.get('href')
        if not href:
            continue
        author = infos[0].text.replace('xxx: ', '')
        yield BASE_URL + href, link.get('title'), author


def _is_stored(cursor, url):
    """Return True when a row with this Url already exists in the table."""
    cursor.execute("SELECT 1 FROM Python WHERE Url = ? LIMIT 1", (url,))
    return cursor.fetchone() is not None


def main():
    """Create the table if needed, then crawl every listing page in order."""
    conn = sqlite3.connect('Python.db')
    try:
        cursor = conn.cursor()
        cursor.execute(CREATE_TABLE_SQL)
        conn.commit()

        for page in range(FIRST_PAGE, LAST_PAGE + 1):
            url = "http://xxx/index_%s.html" % str(page)
            response = requests.get(url=url, headers=HEADERS,
                                    timeout=REQUEST_TIMEOUT)
            response.encoding = "utf-8"
            html = BeautifulSoup(response.text, "lxml")

            for entry_url, title, author in _parse_entries(html):
                if _is_stored(cursor, entry_url):
                    continue
                # Bound parameters let SQLite do the quoting: titles with
                # quotes or other special characters are stored verbatim
                # and cannot alter the statement.
                cursor.execute(
                    "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
                    (entry_url, title, author))

            # Commit once per page so an interrupted run keeps the pages
            # that were already processed.
            conn.commit()
            print("第%s页" % str(page))
    finally:
        # Release the database handle even when a request or parse fails.
        conn.close()


if __name__ == '__main__':
    main()
Python交流群:1047602540
 |
zan
|