数学建模社区-数学中国

标题: Python-xpath [打印本页]

作者: 檀俾九    时间: 2021-1-18 09:29
标题: Python-xpath
[code=python]#!/usr/bin/python
# -*- coding: utf-8 -*-

import requests
from lxml import etree
import sqlite3


def write_sql(c, text):
    """Parse one listing page and insert the entries that are not stored yet.

    Args:
        c: sqlite3 cursor on a database that contains the ``Python`` table.
        text: HTML source of a listing page.

    Returns:
        True if at least one new row was inserted, False otherwise.

    Absolute URLs are built with the module-level ``host``.
    """
    html = etree.HTML(text)
    # NOTE(review): the three lists below are paired positionally by zip();
    # an anchor lacking its <p> or <em> child would shift the pairing.
    # Titles
    titles = html.xpath('//ul[@class="news"]//a[@target="_blank"]/p/text()')
    # Links
    hrefs = html.xpath('//ul[@class="news"]//a[@target="_blank"]/@href')
    # Dates (stored in the ``Author`` column)
    ems = html.xpath('//ul[@class="news"]//a[@target="_blank"]/em/text()')

    number = 0
    for title, href, em in zip(titles, hrefs, ems):
        href = host + href
        # Placeholders keep scraped text out of the SQL statement itself, so
        # quotes in a URL, title or date can neither break nor alter the query.
        c.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (href,))
        # Skip entries whose URL is already in the table.
        if c.fetchone()[0] > 0:
            continue

        c.execute(
            "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
            (href, title, em))
        number += 1
        print(title, href, em)

    # Commit on the connection that owns this cursor rather than on a global.
    c.connection.commit()
    return number > 0
, V+ R$ b7 o7 M5 k
4 d  _4 u; ~/ Q# b. \5 y* p0 a# l
if __name__ == '__main__':
    # Crawl every "/cate/" category linked from the navigation bar and store
    # the entries of each listing page in a local SQLite database.
    #
    # The names assigned here stay at module level on purpose: write_sql()
    # relies on module-level names set in this block (at least `host`).
    REQUEST_TIMEOUT = 10  # seconds; without it requests.get() may block forever

    conn = sqlite3.connect("Python-xxx.db")
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
        Url VARCHAR,
        Title VARCHAR,
        Author VARCHAR
    )''')
        conn.commit()

        host = "https://xxx"
        url = host + "/xxx"
        req = requests.get(url, timeout=REQUEST_TIMEOUT)
        req.encoding = 'utf-8'

        html = etree.HTML(req.text)
        # Category names and their links, paired positionally below.
        clearfixs = html.xpath('//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]/text()')
        hrefs = html.xpath('//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]/@href')

        for clearfix, href in zip(clearfixs, hrefs):
            print(clearfix, host + href)

            page = 1
            while True:
                url = host + href + "/list_%s.html" % page
                req = requests.get(url, timeout=REQUEST_TIMEOUT)
                req.encoding = 'utf-8'

                # write_sql() returns False when the page yielded no new rows,
                # which ends the pagination for this category.
                if not write_sql(c, req.text):
                    break

                print("第%s页" % page)
                page += 1
    finally:
        # Release the database handle even if a request or parse step raised.
        conn.close()
[/code]
xpath用着就是舒服~

8 U5 A7 K9 s4 |" j. _' y
转发自派生社区
Python交流群:1047602540

% e, Z: I9 o: L: k  K
( G+ \" w) g- b7 `# V9 k7 m
9 W: w, U) r1 c* G! v1 b" |




欢迎光临 数学建模社区-数学中国 (http://www.madio.net/) Powered by Discuz! X2.5