数学建模社区-数学中国

标题: Python-xpath [打印本页]

作者: 檀俾九    时间: 2021-1-18 09:29
标题: Python-xpath
[code=python]#!/usr/bin/python
# -*- coding: utf-8 -*-
$ h/ i0 C: o/ E
import sqlite3

import requests
from lxml import etree
* i/ i- ]7 J( e4 B. l) W
& |' o" Y  b! K3 ?. i& f1 A. A( a' k+ y1 v
def write_sql(c, text):
    """Parse one listing page and store its not-yet-seen articles.

    Every ``<a target="_blank">`` under ``<ul class="news">`` is expected to
    contribute a title (``<p>`` text), a link (``href``) and a date (``<em>``
    text).  Links are prefixed with the module-level ``host`` and inserted
    into the ``Python`` table unless a row with the same ``Url`` exists.

    Args:
        c: sqlite3 cursor on a database containing the ``Python`` table
            (columns ``Url``, ``Title``, ``Author``).
        text: HTML source of the listing page.

    Returns:
        True if at least one new row was inserted, otherwise False.
    """
    html = etree.HTML(text)
    if html is None:
        # NOTE(review): lxml appears to return None for an empty/blank
        # document; treat that as "nothing to insert" instead of crashing.
        return False

    anchor = '//ul[@class="news"]//a[@target="_blank"]'
    # Titles
    titles = html.xpath(anchor + '/p/text()')
    # Links
    hrefs = html.xpath(anchor + '/@href')
    # Dates
    ems = html.xpath(anchor + '/em/text()')

    number = 0
    for title, href, em in zip(titles, hrefs, ems):
        # xpath() yields str subclasses; bind plain str values to sqlite3.
        title, href, em = str(title), host + str(href), str(em)

        # Skip links that are already stored.  Placeholders keep quotes in
        # scraped text from breaking (or injecting into) the statement.
        c.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (href,))
        if c.fetchone()[0] > 0:
            continue

        # NOTE(review): the date text is written to the "Author" column;
        # kept as-is because the table schema is defined by the caller.
        c.execute(
            "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
            (href, title, em))
        number += 1
        print(title, href, em)

    # Commit through the cursor's own connection rather than a global.
    c.connection.commit()
    return number > 0
- J- E! f0 y$ e1 @! l! H) r- I% f  K4 m, l
2 c7 ?2 @7 Y+ p, F4 G/ R' r
if __name__ == '__main__':
    # Crawl every "/cate/..." category linked from the site navigation and
    # store the listed articles in a local SQLite database.
    #
    # conn, c and host are intentionally module-level names: write_sql reads
    # module-level state defined here.
    conn = sqlite3.connect("Python-xxx.db")
    c = conn.cursor()
    c.execute('''CREATE TABLE IF NOT EXISTS Python (
        Url VARCHAR,
        Title VARCHAR,
        Author VARCHAR
    )''')
    conn.commit()

    host = "https://xxx"
    # Seconds to wait for the server; without a timeout requests.get can
    # block forever on a stalled connection.
    request_timeout = 10

    try:
        url = host + "/xxx"
        req = requests.get(url, timeout=request_timeout)
        req.encoding = 'utf-8'

        html = etree.HTML(req.text)
        nav_links = '//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]'
        clearfixs = html.xpath(nav_links + '/text()')
        hrefs = html.xpath(nav_links + '/@href')

        for clearfix, href in zip(clearfixs, hrefs):
            print(clearfix, host + href)

            page = 1
            while True:
                url = host + href + "/list_%s.html" % page
                req = requests.get(url, timeout=request_timeout)
                req.encoding = 'utf-8'

                # Stop paging this category once a page adds no new rows
                # (either it has no entries or all of them are stored).
                if not write_sql(c, req.text):
                    break

                print("第%s页" % page)
                page += 1
    finally:
        # Release the database even when a request or parse step fails.
        conn.close()
[/code]
xpath用着就是舒服~

2 R! D: R( a3 ^! n
转发自派生社区
Python交流群:1047602540
- E- _; e7 M/ X, M

* k( a+ t& y- G6 R0 x- [# Y) Q
  N$ f, B9 a2 [# m  x




欢迎光临 数学建模社区-数学中国 (http://www.madio.net/) Powered by Discuz! X2.5