- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
 - #!/usr/bin/python
- # -*- coding: utf-8 -*-
- 2 k# L9 ^# ~, e1 D$ Q) _$ k h1 _# n1 O8 A3 k9 ]- y# y
- import requests
- ' @- M7 Y4 M+ ?$ `from lxml import etree- o5 S- f6 v: X$ D( z+ }
- import sqlite39 k) A9 y% ^# y! S. C
- ! O% C, D, a1 B7 j8 k5 D9 x
- ) d, G4 |6 _; L$ p
def write_sql(c, text):
    """Store the not-yet-seen entries of one listing page in the ``Python`` table.

    Every ``<a target="_blank">`` inside ``<ul class="news">`` contributes a
    title (``p`` text), a link (``href``) and a date (``em`` text). Links are
    prefixed with the module-level ``host``, which must be defined before this
    function is called.

    Args:
        c: sqlite3 cursor on the database that holds the ``Python`` table.
        text: HTML source of the listing page.

    Returns:
        True if at least one new row was inserted, False otherwise.
    """
    html = etree.HTML(text)
    anchors = '//ul[@class="news"]//a[@target="_blank"]'
    # Titles
    titles = html.xpath(anchors + '/p/text()')
    # Links
    hrefs = html.xpath(anchors + '/@href')
    # Dates
    ems = html.xpath(anchors + '/em/text()')

    number = 0
    for title, href, em in zip(titles, hrefs, ems):
        # Normalise to plain str before binding the values as SQL parameters.
        href = host + str(href)
        title = str(title)
        em = str(em)

        # Skip links that are already stored. Placeholders keep quotes or SQL
        # fragments in scraped text from altering the statement.
        c.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (href,))
        if c.fetchone()[0] > 0:
            continue

        # NOTE: the date text is written to the column named ``Author``; the
        # table schema offers no dedicated date column.
        c.execute(
            "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
            (href, title, em))
        number += 1
        print(title, href, em)

    # Commit through the cursor's own connection rather than a global name.
    c.connection.commit()
    return number > 0
- - ^+ ?0 p5 U9 j* X, T: E$ D
if __name__ == '__main__':
    # Script entry point: create the table if needed, read the category links
    # from the navigation bar of the start page, then walk each category's
    # numbered list pages until a page yields no new rows.
    #
    # ``conn`` and ``host`` are deliberately module-level names; write_sql
    # reads ``host`` as a global.
    conn = sqlite3.connect("Python-xxx.db")
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
        )''')
        conn.commit()

        host = "https://xxx"
        url = host + "/xxx"
        # Without a timeout a stalled server would block the crawl forever.
        req = requests.get(url, timeout=30)
        req.encoding = 'utf-8'

        html = etree.HTML(req.text)
        nav_links = '//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]'
        clearfixs = html.xpath(nav_links + '/text()')
        hrefs = html.xpath(nav_links + '/@href')

        for clearfix, href in zip(clearfixs, hrefs):
            print(clearfix, host + href)

            page = 1
            while True:
                url = host + href + "/list_%s.html" % page
                req = requests.get(url, timeout=30)
                req.encoding = 'utf-8'

                # write_sql returns False when the page added nothing new;
                # that is the signal to move on to the next category.
                if not write_sql(c, req.text):
                    break

                print("第%s页" % page)
                page += 1
    finally:
        # Release the database handle even if a request or parse step raised.
        conn.close()
- 6 h9 x; E/ G/ J' f5 _( Y
xpath用着就是舒服~

Python交流群:1047602540

 |
zan
|