- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
#!/usr/bin/python
# -*- coding: utf-8 -*-

import requests
from lxml import etree
import sqlite3

def write_sql(c, text):
    """Store the articles listed on one HTML page, skipping known URLs.

    Args:
        c: sqlite3 cursor on a database that contains the ``Python`` table
            (columns ``Url``, ``Title``, ``Author``).
        text: HTML source of one listing page.

    Returns:
        True if at least one new row was inserted, False otherwise. The
        caller uses False as the signal to stop paging.

    Note:
        Reads the module-level ``host`` (assigned in the ``__main__`` block)
        to turn the relative links found on the page into absolute URLs.
    """
    html = etree.HTML(text)
    # Guard: the parser may produce no root element for an empty/blank
    # document; treat that as "nothing new on this page".
    if html is None:
        return False

    # Every entry is an <a target="_blank"> inside <ul class="news">.
    # Titles
    titles = html.xpath('//ul[@class="news"]//a[@target="_blank"]/p/text()')
    # Links
    hrefs = html.xpath('//ul[@class="news"]//a[@target="_blank"]/@href')
    # Dates
    ems = html.xpath('//ul[@class="news"]//a[@target="_blank"]/em/text()')

    number = 0
    for title, href, em in zip(titles, hrefs, ems):
        href = host + href

        # Bound parameters instead of string formatting: scraped values may
        # contain quotes, which would break (or inject into) hand-built SQL.
        c.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (href,))
        # Skip entries whose URL is already stored.
        if c.fetchone()[0] > 0:
            continue

        # NOTE(review): the <em> text (commented as the date above) is stored
        # in the Author column; kept as-is to stay compatible with the
        # existing table layout.
        c.execute(
            "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
            (href, title, em),
        )
        number += 1
        print(title, href, em)

    # Commit through the cursor's own connection rather than a global name,
    # so the function works with whatever cursor it is handed.
    c.connection.commit()
    return number > 0

if __name__ == '__main__':
    # Crawl every "/cate/" category linked from the navigation bar and store
    # the articles of each listing page in a local SQLite database.
    #
    # NOTE: conn, c and host are intentionally left as module-level names
    # (not wrapped in a main() function) because write_sql() reads `host`
    # as a global.

    # Seconds to wait for the server; without a timeout requests.get() can
    # block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    conn = sqlite3.connect("Python-xxx.db")
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
        )''')
        conn.commit()

        host = "https://xxx"
        url = host + "/xxx"
        req = requests.get(url, timeout=REQUEST_TIMEOUT)
        req.encoding = 'utf-8'

        # Category names and their relative links from the navigation bar.
        html = etree.HTML(req.text)
        clearfixs = html.xpath('//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]/text()')
        hrefs = html.xpath('//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]/@href')

        for clearfix, href in zip(clearfixs, hrefs):
            print(clearfix, host + href)

            # Walk list_1.html, list_2.html, ... until a page yields no new
            # rows (write_sql returns False).
            page = 1
            while True:
                url = host + href + "/list_%s.html" % page
                req = requests.get(url, timeout=REQUEST_TIMEOUT)
                req.encoding = 'utf-8'

                if not write_sql(c, req.text):
                    break

                print("第%s页" % page)
                page += 1
    finally:
        # Always release the database handle, even if a request or the
        # parsing step raises part-way through the crawl.
        conn.close()
xpath用着就是舒服~

Python交流群:1047602540
|
zan
|