- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
 - #!/usr/bin/python
- ( ~6 }3 u0 H\" o\" ] L# -*- coding: utf-8 -*-/ k5 c* C+ I* S' t\" J$ @
- 3 _5 O* m3 B0 `
- import requests) b; N/ M) V. y\" }( ]
- from lxml import etree
- 7 E) `6 u& w' oimport sqlite3
- . A2 [9 Y$ e- [, h$ f; ^# {, q/ W' M4 e+ ?
- M0 e2 l( T+ W6 m8 L
def write_sql(c, text):
    """Parse one listing page and store the articles not yet in the database.

    Each article URL is built by prefixing the scraped href with the
    module-level ``host`` string. Articles whose URL is already present in
    the ``Python`` table are skipped.

    Args:
        c: sqlite3 cursor on a database containing the ``Python`` table
            (columns ``Url``, ``Title``, ``Author``).
        text: HTML source of a listing page.

    Returns:
        True if at least one new row was inserted, otherwise False.
    """
    html = etree.HTML(text)
    anchors = '//ul[@class="news"]//a[@target="_blank"]'
    # Titles
    titles = html.xpath(anchors + '/p/text()')
    # Links
    hrefs = html.xpath(anchors + '/@href')
    # Dates (stored in the Author column)
    ems = html.xpath(anchors + '/em/text()')

    number = 0
    for title, href, em in zip(titles, hrefs, ems):
        href = host + href

        # Check whether this URL has already been stored.
        c.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (href,))
        if c.fetchone()[0] > 0:
            continue

        # Bound parameters instead of string formatting: scraped text may
        # contain quotes, which previously broke (or injected into) the SQL.
        # str() normalises lxml's str-subclass results to plain strings.
        c.execute(
            "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
            (href, str(title), str(em)),
        )
        number += 1
        print(title, href, em)

    # Commit through the cursor's own connection rather than a global.
    c.connection.commit()
    return number > 0
- $ R: {$ I6 w/ Z/ X% s% `; y
if __name__ == '__main__':
    # Crawl every "/cate/" category linked from the site navigation and
    # store its articles page by page until a page yields no new rows.
    conn = sqlite3.connect("Python-xxx.db")
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
        )''')
        conn.commit()

        # ``host`` must stay a module-level name: write_sql() reads it as a
        # global when building article URLs.
        host = "https://xxx"
        url = host + "/xxx"
        # A timeout keeps a stalled server from hanging the crawl forever.
        req = requests.get(url, timeout=30)
        req.encoding = 'utf-8'

        html = etree.HTML(req.text)
        nav_links = '//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]'
        clearfixs = html.xpath(nav_links + '/text()')
        hrefs = html.xpath(nav_links + '/@href')

        for clearfix, href in zip(clearfixs, hrefs):
            print(clearfix, host + href)

            page = 1
            while True:
                url = host + href + "/list_%s.html" % page
                req = requests.get(url, timeout=30)
                req.encoding = 'utf-8'

                # write_sql() returns False when the page added nothing new,
                # which is the signal to move on to the next category.
                if not write_sql(c, req.text):
                    break

                print("第%s页" % page)
                page += 1
    finally:
        # Close the database even if a request or parse step raised.
        conn.close()
xpath用着就是舒服~
% V4 @# Y% s" Z4 O2 \& H B
Python交流群:1047602540
- T2 T2 u/ b/ `" U
% f8 {/ m7 q( ~6 P( r6 c
* V1 u5 ^2 l2 O+ P" M |
zan
|