数学建模社区-数学中国
标题:
Python-xpath
[打印本页]
作者:
檀俾九
时间:
2021-1-18 09:29
标题:
Python-xpath
[code=python]#!/usr/bin/python
' v$ X0 k- {! i9 g( v1 h9 R, `
# -*- coding: utf-8 -*-
% o+ z% l- h9 Q* Z
- c d" w2 r9 E9 {5 h
import requests
n! P; S: U# j# J. ~. Z# K
from lxml import etree
, Q) Q* d% X8 N) ]
import sqlite3
" k& x) {; }$ ]
# e, L |" V# g! }
. i \ m. L, M" ^3 n9 Q6 }% y
def write_sql(c, text):
    """Store the not-yet-seen articles of one listing page in the ``Python`` table.

    Every ``<a target="_blank">`` under ``<ul class="news">`` contributes one
    candidate row: the anchor's ``href`` (prefixed with the module-level
    ``host``), the text of its ``<p>`` child (title) and the text of its
    ``<em>`` child (noted as the date in the original script; it is written
    to the ``Author`` column).

    Args:
        c: sqlite3 cursor on a database that already contains the ``Python``
            table.
        text: HTML source of one listing page.

    Returns:
        True if at least one new row was inserted; False when the page
        yielded no entries or every entry was already stored.
    """
    html = etree.HTML(text)
    if html is None:
        # No tree to query, so nothing can be inserted from this page.
        return False

    anchors = '//ul[@class="news"]//a[@target="_blank"]'
    # Title
    titles = html.xpath(anchors + '/p/text()')
    # Link
    hrefs = html.xpath(anchors + '/@href')
    # Date
    ems = html.xpath(anchors + '/em/text()')

    number = 0
    for title, href, em in zip(titles, hrefs, ems):
        href = host + href

        # Scraped values are bound as parameters rather than formatted into
        # the SQL text, so quotes in a title or URL cannot break or alter
        # the statement.
        c.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (href,))
        # Skip URLs that are already stored.
        if c.fetchone()[0] > 0:
            continue

        c.execute(
            "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
            (href, title, em))
        number += 1
        print(title, href, em)

    # Commit through the cursor's own connection instead of a global `conn`.
    c.connection.commit()
    return number > 0
- b: D+ f, @( ?: |; `7 c
* X% u1 D$ s1 p+ l9 A# ?
- u$ Z# j' X8 M
if __name__ == '__main__':
    # Crawl every "/cate/..." category linked from the site's navigation bar
    # and store its articles page by page until write_sql() reports that a
    # page added nothing new.

    # Seconds to wait for each HTTP response; without a timeout a stalled
    # server would block the crawl forever.
    REQUEST_TIMEOUT = 10

    conn = sqlite3.connect("Python-xxx.db")
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
            )''')
        conn.commit()

        # `host` stays a module-level name because write_sql() reads it.
        host = "https://xxx"
        url = host + "/xxx"
        req = requests.get(url, timeout=REQUEST_TIMEOUT)
        req.encoding = 'utf-8'

        html = etree.HTML(req.text)
        nav_links = '//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]'
        clearfixs = html.xpath(nav_links + '/text()')
        hrefs = html.xpath(nav_links + '/@href')

        for clearfix, href in zip(clearfixs, hrefs):
            print(clearfix, host + href)

            page = 1
            while True:
                url = host + href + "/list_%s.html" % page
                req = requests.get(url, timeout=REQUEST_TIMEOUT)
                req.encoding = 'utf-8'

                # Stop paging this category once a page inserts no rows.
                if not write_sql(c, req.text):
                    break

                print("第%s页" % page)
                page += 1
    finally:
        # Release the database handle even if a request or query fails.
        conn.close()
; K( z! H) U: L7 d
[/code]
xpath用着就是舒服~
转发自
派生社区
Python交流群:1047602540
欢迎光临 数学建模社区-数学中国 (http://www.madio.net/)
Powered by Discuz! X2.5