- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
 - #!/usr/bin/python
# -*- coding: utf-8 -*-
"""Crawl a paginated article index and record every article in SQLite.

Each index page ``http://xxx/index_<n>.html`` is fetched, every
``div.loop`` entry on it is parsed for its link, title and author, and
entries whose URL is not yet in the ``Python`` table of ``Python.db`` are
inserted.  Re-running the script is safe: known URLs are skipped.
"""

import sqlite3
from re import escape  # noqa: F401 -- no longer used here; kept so nothing else relying on it breaks

import requests
from bs4 import BeautifulSoup


def main():
    """Walk index pages 1..1045 and store each article that is not stored yet.

    Side effects: creates/opens ``Python.db`` in the working directory,
    creates the ``Python`` table if needed, performs one HTTP GET per index
    page, commits once per page and prints a progress line per page.

    Raises:
        requests.RequestException: if a page cannot be fetched (including
            a timeout).
        IndexError: if an entry lacks the expected title link or author span.
    """
    conn = sqlite3.connect('Python.db')
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
        )''')
        conn.commit()

        # --------------------Split Line--------------------
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36"
        }

        for i in range(1, 1046):
            url = "http://xxx/index_%s.html" % i
            # Without a timeout a single stalled connection blocks the crawl forever.
            req = requests.get(url=url, headers=headers, timeout=30)
            req.encoding = "utf-8"
            html = BeautifulSoup(req.text, "lxml")

            # --------------------Split Line--------------------
            for div in html.find_all('div', class_='loop'):
                content_body = div.select('h2 > a')[0]
                content_infor = div.select('.content_infor > span:nth-child(3)')[0]
                article_url = "http://xxx" + content_body.get('href')

                # --------------------Split Line--------------------
                # Skip articles already stored.  Bound parameters are used
                # because href/title/author come from the scraped page and
                # may contain quotes or hostile SQL.
                already_stored = c.execute(
                    "SELECT 1 FROM Python WHERE Url = ? LIMIT 1",
                    (article_url,),
                ).fetchone()
                if already_stored is not None:
                    continue

                # --------------------Split Line--------------------
                # The title is stored verbatim: with placeholders no manual
                # quote doubling or re.escape() (which injected regex
                # backslashes into the stored text) is needed.
                c.execute(
                    "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
                    (
                        article_url,
                        content_body.get('title'),
                        content_infor.text.replace('xxx: ', ''),
                    ),
                )

            # One commit per page keeps progress if a later page fails.
            conn.commit()
            print("第%s页" % i)

        # --------------------Split Line--------------------
    finally:
        # Always release the database handle, even when a request or parse fails.
        conn.close()


if __name__ == '__main__':
    main()
! }3 Z. a/ Q) m% d5 ^- |6 i
Python交流群:1047602540
2 [/ o( U' w5 S( [" B. G. S% p |
zan
|