- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
 - #!/usr/bin/python! b: ]& ?& ~\" a# O
- # -*- coding: utf-8 -*-
- ) [* h/ Q. ?) W, p+ V; a' e( K7 }# @\" n& X0 ~. M& a+ @* i
- import requests; _$ e9 d# z1 [3 T6 r
- from lxml import etree! ^/ F2 M5 k4 ~8 R
- import sqlite3
- , o& m5 Z, u; P: z* }8 a
- 2 F7 r6 R7 m+ W! u2 Z C5 N: c; L5 Q, ]/ d
def write_sql(c, text):
    """Parse one listing page and insert its not-yet-stored articles.

    Args:
        c: sqlite3 cursor on the database holding the ``Python`` table.
        text: HTML source of a listing page.

    Returns:
        True if at least one new row was inserted, False otherwise
        (callers use this as the signal to stop paginating).

    Note:
        Relies on the module-level ``host`` to turn relative links into
        absolute URLs.
    """
    html = etree.HTML(text)
    # All three queries walk the same anchors inside <ul class="news">.
    # Title
    titles = html.xpath('//ul[@class="news"]//a[@target="_blank"]/p/text()')
    # Link
    hrefs = html.xpath('//ul[@class="news"]//a[@target="_blank"]/@href')
    # Date (the <em> text; it is stored in the "Author" column below)
    ems = html.xpath('//ul[@class="news"]//a[@target="_blank"]/em/text()')

    number = 0
    for title, href, em in zip(titles, hrefs, ems):
        href = host + href
        # Bound parameters instead of string formatting: the values come
        # from a scraped page, so quotes in them must never reach the SQL
        # text.
        c.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (href,))
        # Skip URLs that are already stored.
        if c.fetchone()[0] > 0:
            continue
        c.execute(
            "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
            (href, title, em))
        number += 1
        print(title, href, em)

    # Commit through the cursor's own connection rather than a global.
    c.connection.commit()
    return number > 0
- 7 l1 H% z9 C+ S1 g) [: |6 J$ x( E% j
- 1 H! Z% L7 \ V
if __name__ == '__main__':
    # Crawl every category linked from the site's navigation bar and store
    # the articles of each listing page in a local SQLite database.
    # NOTE: `host` and `conn` stay module-level names because write_sql()
    # reads them as globals.
    timeout_seconds = 30  # without a timeout requests.get() can block forever

    conn = sqlite3.connect("Python-xxx.db")
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
        )''')
        conn.commit()

        host = "https://xxx"
        url = host + "/xxx"
        req = requests.get(url, timeout=timeout_seconds)
        req.encoding = 'utf-8'

        # Category names and their relative links from the navigation bar.
        html = etree.HTML(req.text)
        clearfixs = html.xpath('//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]/text()')
        hrefs = html.xpath('//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]/@href')

        for clearfix, href in zip(clearfixs, hrefs):
            print(clearfix, host + href)
            page = 1
            # Walk list_1.html, list_2.html, ... until a page yields no new
            # rows (write_sql returns False).
            while True:
                url = host + href + "/list_%s.html" % page
                req = requests.get(url, timeout=timeout_seconds)
                req.encoding = 'utf-8'
                if not write_sql(c, req.text):
                    break
                print("第%s页" % page)
                page += 1
    finally:
        # Close the database even if a request or parse step raised.
        conn.close()
- # F Y4 D. d8 K\" ~, ~) J
0 Z, @4 P. X% C! F, z# \
xpath用着就是舒服~
Python交流群:1047602540
9 Y7 [$ \% s g& U. G8 S& a, c% N8 Q2 k' Z* ] b
0 B9 y. s2 ^+ O" q; F |
zan
|