- 在线时间
- 7 小时
- 最后登录
- 2021-1-20
- 注册时间
- 2021-1-9
- 听众数
- 3
- 收听数
- 0
- 能力
- 0 分
- 体力
- 23 点
- 威望
- 0 点
- 阅读权限
- 20
- 积分
- 10
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 7
- 主题
- 7
- 精华
- 0
- 分享
- 0
- 好友
- 0
升级   5.26% 该用户从未签到
- 自我介绍
- 我本名为我,那就是我
 |
#!/usr/bin/python
# -*- coding: utf-8 -*-

import requests
from lxml import etree
import sqlite3

def write_sql(c, text):
    """Store the articles listed on one HTML page that are not saved yet.

    The page is parsed with lxml. For every link found under
    ``<ul class="news">`` the title (``<p>`` text), the link target
    (``href``) and the ``<em>`` text are collected and paired up
    positionally. Each link is made absolute by prefixing the module-level
    ``host`` and is inserted into the ``Python`` table unless a row with
    the same ``Url`` already exists.

    Args:
        c: sqlite3 cursor on a database that contains the ``Python`` table.
        text: HTML source of the listing page.

    Returns:
        True if at least one new row was inserted, False otherwise (no
        entries on the page, or every entry was already stored).
    """
    html = etree.HTML(text)
    anchors = '//ul[@class="news"]//a[@target="_blank"]'
    # Title
    titles = html.xpath(anchors + '/p/text()')
    # Link
    hrefs = html.xpath(anchors + '/@href')
    # Date (note: this value ends up in the "Author" column below)
    ems = html.xpath(anchors + '/em/text()')

    number = 0
    for title, href, em in zip(titles, hrefs, ems):
        href = host + href

        # Skip entries whose URL is already stored.
        # Placeholders are used instead of string formatting so quotes in
        # scraped text can neither break the statement nor inject SQL.
        c.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (href,))
        if c.fetchone()[0] > 0:
            continue

        # str() hands sqlite3 plain strings; lxml presumably returns str
        # subclasses for XPath text results, and the stored text is the same.
        c.execute(
            "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
            (href, str(title), str(em)),
        )
        number += 1
        print(title, href, em)

    # Commit through the cursor's own connection rather than a module global.
    c.connection.commit()
    return number > 0

if __name__ == '__main__':
    # Crawl every category linked from the site's navigation bar and store
    # the articles of each category page by page in a local SQLite database.

    # Seconds to wait for the server before giving up on a request; without
    # a timeout a stalled connection would block the crawl indefinitely.
    REQUEST_TIMEOUT = 10

    conn = sqlite3.connect("Python-xxx.db")
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
            )''')
        conn.commit()

        host = "https://xxx"
        url = host + "/xxx"
        req = requests.get(url, timeout=REQUEST_TIMEOUT)
        req.encoding = 'utf-8'

        # Category names and their relative links from the navigation bar.
        html = etree.HTML(req.text)
        nav_links = '//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]'
        clearfixs = html.xpath(nav_links + '/text()')
        hrefs = html.xpath(nav_links + '/@href')

        for clearfix, href in zip(clearfixs, hrefs):
            print(clearfix, host + href)

            # Walk list_1.html, list_2.html, ... until a page adds no new
            # rows (write_sql returns False for an empty page and for a
            # page whose entries are all stored already).
            page = 1
            while True:
                url = host + href + "/list_%s.html" % page
                req = requests.get(url, timeout=REQUEST_TIMEOUT)
                req.encoding = 'utf-8'

                if not write_sql(c, req.text):
                    break

                print("第%s页" % page)
                page += 1
    finally:
        # Release the database file even when a request or parse step fails.
        conn.close()

xpath用着就是舒服~

Python交流群:1047602540

|
zan
|