- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
 - #!/usr/bin/python
- 4 o1 z3 X* ]9 [1 t) n# -*- coding: utf-8 -*-
- 3 }: j+ _; [- @& S. P8 f
- 1 \( n: K' ]& b. S( ^* E( L4 gimport requests6 `+ q2 o Y9 I# N
- from lxml import etree7 p3 i& I1 L) N# A
- import sqlite3/ c3 K! [: u [$ I
- A\" I* M* y$ U- D- P9 S
def write_sql(c, text, base_url=None):
    """Parse one listing page and store the articles that are not yet saved.

    Args:
        c: sqlite3 cursor on a database that already has the ``Python`` table.
        text: HTML source of one listing page.
        base_url: Prefix prepended to every link found on the page.  When
            omitted, the module-level ``host`` is used, so existing
            two-argument calls keep working.

    Returns:
        True if at least one new row was inserted, False otherwise (the
        caller uses False as the signal to stop paging).
    """
    if base_url is None:
        base_url = host

    html = etree.HTML(text)
    # NOTE(review): lxml may hand back None for an empty document; treat that
    # the same as "nothing new on this page" instead of failing on .xpath.
    if html is None:
        return False

    anchors = '//ul[@class="news"]//a[@target="_blank"]'
    # Title
    titles = html.xpath(anchors + '/p/text()')
    # Link
    hrefs = html.xpath(anchors + '/@href')
    # Date
    ems = html.xpath(anchors + '/em/text()')

    number = 0
    for title, href, em in zip(titles, hrefs, ems):
        href = base_url + href

        # Skip links that are already stored.  Placeholders let sqlite3 do
        # the quoting, so quotes inside scraped values cannot break or alter
        # the statement.
        c.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (href,))
        if c.fetchone()[0] > 0:
            continue

        # The value taken from <em> (the date) goes into the Author column,
        # exactly as before.  str() normalises the XPath results to plain
        # strings before they are bound.
        c.execute(
            "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
            (href, str(title), str(em)),
        )
        number += 1
        print(title, href, em)

    # Commit through the cursor's own connection rather than a global.
    c.connection.commit()
    return number > 0
- 2 i3 R4 B8 N! y) C
- \" Q; y. v6 p6 V: T
if __name__ == '__main__':
    # Crawl every category linked from the site's navigation bar and store
    # the article links of each category page by page until write_sql reports
    # that a page produced no new rows.

    # Seconds to wait for the server; without a timeout requests.get can
    # block indefinitely on an unresponsive host.
    REQUEST_TIMEOUT = 30

    conn = sqlite3.connect("Python-xxx.db")
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
        )''')
        conn.commit()

        host = "https://xxx"
        url = host + "/xxx"
        req = requests.get(url, timeout=REQUEST_TIMEOUT)
        req.encoding = 'utf-8'

        html = etree.HTML(req.text)
        nav_links = '//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]'
        clearfixs = html.xpath(nav_links + '/text()')
        hrefs = html.xpath(nav_links + '/@href')

        for clearfix, href in zip(clearfixs, hrefs):
            print(clearfix, host + href)

            page = 1
            while True:
                url = host + href + "/list_%s.html" % page
                req = requests.get(url, timeout=REQUEST_TIMEOUT)
                req.encoding = 'utf-8'

                # A page with no new rows ends this category.
                if not write_sql(c, req.text):
                    break
                print("第%s页" % page)
                page += 1
    finally:
        # Release the database handle even when a request or parse fails.
        conn.close()
xpath用着就是舒服~
) I( y, J: x+ W
Python交流群:1047602540
9 Q8 c* y: _3 d8 G8 [& g X) g
1 q7 K& [( r* w5 Q* h3 ? |
zan
|