数学建模社区-数学中国
标题:
Python-xpath
[打印本页]
作者:
檀俾九
时间:
2021-1-18 09:29
标题:
Python-xpath
[code=python]#!/usr/bin/python
# -*- coding: utf-8 -*-

import requests
from lxml import etree
import sqlite3


def write_sql(c, text):
    """Parse one listing page and store every article not yet in the database.

    Args:
        c: sqlite3 cursor on a database that already contains the ``Python``
            table (columns ``Url``, ``Title``, ``Author``).
        text: HTML source of the listing page.

    Returns:
        True if at least one new row was inserted, False otherwise (the page
        listed no articles, or every listed article was already stored).

    Note:
        Links are prefixed with the module-level ``host``, which this script
        assigns in its ``__main__`` section before calling this function.
    """
    html = etree.HTML(text)
    link_xpath = '//ul[@class="news"]//a[@target="_blank"]'
    # Title
    titles = html.xpath(link_xpath + '/p/text()')
    # Link
    hrefs = html.xpath(link_xpath + '/@href')
    # Date
    ems = html.xpath(link_xpath + '/em/text()')

    number = 0
    # NOTE(review): zip() pairs the three lists by position, so a link that
    # lacks a <p> or <em> child would shift every later row - confirm the
    # page markup always provides both.
    for title, href, em in zip(titles, hrefs, ems):
        href = host + href

        # Skip the article if its URL is already stored.  Placeholders keep
        # quotes in the scraped values from breaking (or injecting into) the
        # statement.
        c.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (href,))
        if c.fetchone()[0] > 0:
            continue

        # The value scraped from <em> (labelled as the date above) is what
        # ends up in the Author column; the schema is left unchanged.
        c.execute(
            "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
            (href, title, em))
        number += 1
        print(title, href, em)

    # Commit through the cursor's own connection rather than a global one.
    c.connection.commit()
    return number > 0
- J- E! f0 y$ e1 @
! l! H) r- I% f K4 m, l
2 c7 ?2 @7 Y+ p, F4 G/ R' r
if __name__ == '__main__':
    # Crawl every category linked from the navigation bar and store the
    # articles of each category page by page in a local SQLite database.

    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the crawl forever.
    REQUEST_TIMEOUT = 10

    conn = sqlite3.connect("Python-xxx.db")
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
        )''')
        conn.commit()

        # ``host`` stays a module-level name because write_sql() reads it.
        host = "https://xxx"
        url = host + "/xxx"
        req = requests.get(url, timeout=REQUEST_TIMEOUT)
        req.encoding = 'utf-8'

        # Category names and their relative links from the navigation bar.
        html = etree.HTML(req.text)
        nav_xpath = '//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]'
        clearfixs = html.xpath(nav_xpath + '/text()')
        hrefs = html.xpath(nav_xpath + '/@href')

        for clearfix, href in zip(clearfixs, hrefs):
            print(clearfix, host + href)

            page = 1
            while True:
                url = host + href + "/list_%s.html" % page
                req = requests.get(url, timeout=REQUEST_TIMEOUT)
                req.encoding = 'utf-8'

                # Stop paging through this category as soon as a page adds
                # no new rows (nothing listed, or everything already stored).
                if not write_sql(c, req.text):
                    break

                print("第%s页" % page)
                page += 1
    finally:
        # Release the database file even when a request or parse step fails.
        conn.close()
[/code]
xpath用着就是舒服~

转发自
派生社区
Python交流群:1047602540
欢迎光临 数学建模社区-数学中国 (http://www.madio.net/)
Powered by Discuz! X2.5