- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
 - #!/usr/bin/python9 j4 N1 X2 h$ h) O9 p
- # -*- coding: utf-8 -*-
- 8 c( }5 R9 G7 }$ n\" ~3 G1 |$ `) V' ?9 N! q# B/ K
- import requests
- ; ]9 R* x; {2 l& n7 s# ~from lxml import etree
- ) ?- Q7 N# `, T1 J5 F- q6 d6 Y6 pimport sqlite3
- . s1 e7 j; r/ ? Y' U0 C/ f
- % p9 q( Y( G: J z% l
def write_sql(c, text):
    """Store the entries of one listing page in the ``Python`` table.

    The page is parsed with lxml; every ``<a target="_blank">`` inside
    ``<ul class="news">`` contributes a title (``<p>`` text), a link
    (``href``) and a date (``<em>`` text).  Links are made absolute by
    prefixing the module-level ``host``.  Rows whose URL is already in the
    table are skipped.

    Args:
        c: sqlite3 cursor on the database that holds the ``Python`` table.
        text: HTML source of the listing page.

    Returns:
        True if at least one new row was inserted, False otherwise.  The
        caller uses False as the signal to stop paging.
    """
    if not text:
        return False
    html = etree.HTML(text)
    # Defensive: without a parsed tree there is nothing to query.
    if html is None:
        return False

    anchor = '//ul[@class="news"]//a[@target="_blank"]'
    titles = html.xpath(anchor + '/p/text()')   # title
    hrefs = html.xpath(anchor + '/@href')       # link
    ems = html.xpath(anchor + '/em/text()')     # date

    number = 0
    for title, href, em in zip(titles, hrefs, ems):
        # lxml returns str subclasses; convert to plain str before binding.
        href = host + str(href)
        title = str(title)
        em = str(em)

        # Placeholders instead of string formatting: scraped text may
        # contain quotes that would break or alter the statement.
        c.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (href,))
        # Skip URLs that are already stored.
        if c.fetchone()[0] > 0:
            continue

        # NOTE(review): the scraped date is stored in the Author column,
        # as in the original statement.
        c.execute(
            "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
            (href, title, em))
        number += 1
        print(title, href, em)

    # Commit on the cursor's own connection rather than a global name.
    c.connection.commit()
    return number > 0
- ( g$ e$ _$ d0 _' E* ]& [0 W\" ~) w$ K8 ^# ~0 {
if __name__ == '__main__':
    # Crawl every "/cate/..." category linked from the navigation bar and
    # store its paginated listings in a local SQLite database.
    # ``conn``, ``c`` and ``host`` stay module-level names because
    # write_sql() reads them.
    conn = sqlite3.connect("Python-xxx.db")
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
        )''')
        conn.commit()

        host = "https://xxx"
        url = host + "/xxx"
        # requests has no default timeout; without one a stalled server
        # would block the crawl indefinitely.
        req = requests.get(url, timeout=30)
        req.encoding = 'utf-8'

        html = etree.HTML(req.text)
        nav_link = '//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]'
        clearfixs = html.xpath(nav_link + '/text()')
        hrefs = html.xpath(nav_link + '/@href')

        for clearfix, href in zip(clearfixs, hrefs):
            print(clearfix, host + href)
            page = 1
            while True:
                url = host + href + "/list_%s.html" % page
                req = requests.get(url, timeout=30)
                req.encoding = 'utf-8'

                # write_sql() returns False when the page added no new
                # rows; that ends the paging of this category.
                if not write_sql(c, req.text):
                    break

                print("第%s页" % page)
                page += 1
    finally:
        # Close the database even if a request or parse step raised.
        conn.close()
- ! b( {* x P5 L/ N
xpath用着就是舒服~
Python交流群:1047602540
$ B; R( ^% d2 k0 N" c( m$ y
8 t1 w5 c, b; R0 F |
zan
|