- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
 - #!/usr/bin/python
- 7 M. ]& A2 w0 \& X+ v- N$ t# -*- coding: utf-8 -*-
- \" Z6 A( R3 q! R8 |9 W- v+ ]9 ^0 U, [0 Q8 J! ~8 Q
- import sqlite3/ p\" M2 f- B# @' \7 u' j
- import requests5 i- S/ K; ?& N! I# Q! [
- from bs4 import BeautifulSoup
- ( N0 H; x: Q* w$ wfrom re import escape4 N7 u t# t0 j q& ~5 c2 S% e
- - h\" o/ w+ @& k9 D7 Qif __name__ == '__main__':
- - X0 E\" \5 I x4 q) o8 ^ \: X conn = sqlite3.connect('Python.db')8 h) Y- g: {2 ?% J& w, ~+ c
- c = conn.cursor()
- ! [1 d( J1 d0 n X$ b0 `6 _+ y c.execute('''CREATE TABLE IF NOT EXISTS Python (
- $ z$ ?; p# Y7 w# r( `# O Url VARCHAR,
- * o' p6 h9 F5 O) I- Y; j0 |2 ] Title VARCHAR,
- - l\" S% P; q9 C1 | Author VARCHAR1 \1 h% Q' z; O; [+ x& E
- )''')
- ' ^9 r4 a, B- A* [, Z, Q( q. R conn.commit()' n1 T% B3 m4 F* N
- 8 n1 e$ c0 \& c # --------------------Split Line--------------------- ?! Q( Q* [% d+ a
- headers = {/ z! o9 y8 a/ t2 p
- "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36"
- , U0 W5 t* _3 ~+ l* j7 ^ }
- # l5 h, Z$ k, o L, G- I( h# w2 t1 b\" d' z& m/ L
- for i in range(1, 1046):* z- R7 ?: P1 D! e) j0 a1 j! Q
- url = "http://xxx/index_%s.html" % str(i)
- ! G9 j: B' x4 \) P3 A* s) f req = requests.get(url=url, headers=headers)8 P6 C$ w4 e! }0 ?, S* q A
- req.encoding = "utf-8"/ W* |$ m& z3 l# K P
- html = BeautifulSoup(req.text, "lxml")
- , c$ c0 r& ?$ q3 e, }# o
- : G4 G% Z9 V6 R' j4 I/ a # --------------------Split Line--------------------8 v8 a( I6 W3 O, u* ^
- for div in html.find_all('div', class_='loop'):9 q2 i& m j$ D9 c\" C4 d4 k( q
- content_body = div.select('h2 > a')[0]
- % @' Q0 X7 E% I1 b content_infor = div.select('.content_infor > span:nth-child(3)')[0], @5 Q. L6 R1 i
- \" ~# U6 y+ X- y1 p, y: D& h; } # --------------------Split Line--------------------
- \" a! L, l' f9 a4 S9 w/ W5 H cursor = c.execute(8 ~& M! E: Q9 e4 q8 l) [
- "SELECT COUNT(*) FROM Python WHERE Url = '%s'" % ("http://xxx" + content_body.get('href')))3 y, c; k9 ]& d! ^% f) Y- l
- len = 04 B9 L3 I/ Y! `* Z3 z. G
- for row in cursor:- s0 s& m- m6 `
- len = row[0]
- 4 C/ {5 q& c1 |0 V1 |) w' x if len > 0:+ m0 R2 ?( S' v+ ^5 v
- continue( \3 l$ n' x4 P, M- w- k
- & Z8 a0 M/ `, K# C% g9 K, O' V # --------------------Split Line--------------------$ W' E: U/ @8 X& J
- c.execute('INSERT INTO Python( Url, Title, Author) VALUES ( "%s", "%s", "%s")' % (
- * S1 `; r) v3 q+ g/ O "http://xxx" + content_body.get('href'),/ l5 D\" P. o! X9 v
- escape(content_body.get('title').replace("\"", "\"\"")),2 d\" c9 E7 E; Z. g3 c, A& k) O( e
- content_infor.text.replace('xxx: ', '')))% b- v' r4 ?! a
- 4 }+ m/ a) a, K/ R conn.commit()
- 2 z7 Y4 h( T) U print("第%s页" % str(i))
- \" ^( z& F4 E0 T/ _* w* T3 Z& {, j\" X! s- h- E7 _& y! {
- # --------------------Split Line--------------------% D: S! O, B& M$ J* `, P( J\" C
- conn.close()
- & K7 Y5 ^8 v7 }
Python交流群:1047602540
 |
zan
|