数学建模社区-数学中国
标题:
Python-xpath
[打印本页]
作者:
檀俾九
时间:
2021-1-18 09:29
标题:
Python-xpath
[code=python]#!/usr/bin/python
1 O+ ~; Q4 z( _# k7 B
# -*- coding: utf-8 -*-
/ h2 t5 p9 Y( h1 {5 i# `
" k s( b$ k V" [1 K
import requests
: ^$ E* d! w u. g _% o6 `) U: d
from lxml import etree
; G8 |+ K, T2 Y( n! E5 O6 W- V
import sqlite3
! N+ {1 F$ a* {
( \1 P- S& Q6 j( D" Y
, Z& o& l" L" F; U
def write_sql(c, text):
    """Parse one listing page and insert its not-yet-stored entries.

    For every ``<a target="_blank">`` inside ``<ul class="news">`` the link,
    the ``<p>`` text (title) and the ``<em>`` text (date) are collected.
    Links are prefixed with the module-level ``host``. Entries whose URL is
    already present in the ``Python`` table are skipped.

    Args:
        c: sqlite3 cursor on a database containing the ``Python`` table
            (columns ``Url``, ``Title``, ``Author``).
        text: HTML source of the listing page.

    Returns:
        True if at least one new row was inserted, otherwise False.
    """
    html = etree.HTML(text)
    if html is None:
        # Nothing parseable: treat it like a page without entries instead of
        # failing on ``None.xpath``.
        return False

    anchor = '//ul[@class="news"]//a[@target="_blank"]'
    # Title
    titles = html.xpath(anchor + '/p/text()')
    # Link
    hrefs = html.xpath(anchor + '/@href')
    # Date
    ems = html.xpath(anchor + '/em/text()')

    number = 0
    # NOTE(review): the three lists are paired by position, which assumes
    # every anchor carries exactly one <p> text and one <em> text.
    for title, href, em in zip(titles, hrefs, ems):
        href = host + href

        # Skip entries whose URL is already stored. Placeholders keep quotes
        # in scraped values from breaking (or injecting into) the statement.
        c.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (href,))
        if c.fetchone()[0] > 0:
            continue

        # The date text goes into the ``Author`` column, as in the original
        # statement. str() turns lxml's string results into plain str.
        c.execute(
            "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
            (href, str(title), str(em)))
        number += 1
        print(title, href, em)

    # Commit through the cursor's own connection rather than a global name.
    c.connection.commit()
    return number > 0
- O7 L& u& m; }, K/ O
5 D, H: n- t3 \3 c: L, C2 Z$ S w# u
$ B% _. S# o- R% _9 `
if __name__ == '__main__':
    # Crawl every "/cate/" category linked from the navigation bar and store
    # the entries of each listing page in a local SQLite database.
    #
    # ``conn`` and ``host`` are deliberately module-level names: write_sql()
    # looks them up as globals.
    REQUEST_TIMEOUT = 30  # seconds; without it a stalled server hangs the crawl

    conn = sqlite3.connect("Python-xxx.db")
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
        )''')
        conn.commit()

        host = "https://xxx"
        url = host + "/xxx"
        req = requests.get(url, timeout=REQUEST_TIMEOUT)
        req.encoding = 'utf-8'

        # Category names and their relative links from the navigation bar.
        html = etree.HTML(req.text)
        clearfixs = html.xpath('//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]/text()')
        hrefs = html.xpath('//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]/@href')

        for clearfix, href in zip(clearfixs, hrefs):
            print(clearfix, host + href)

            # Walk list_1.html, list_2.html, ... until a page yields no new
            # rows (write_sql returns False).
            page = 1
            while True:
                url = host + href + "/list_%s.html" % page
                req = requests.get(url, timeout=REQUEST_TIMEOUT)
                req.encoding = 'utf-8'

                if not write_sql(c, req.text):
                    break

                print("第%s页" % page)
                page += 1
    finally:
        # Close the database even when a request or parse step raises.
        conn.close()
4 m# a0 H& G* Z+ b( `0 J' e; ~
[/code]
1 K) ]5 b2 r' U3 D$ W3 u& F
xpath用着就是舒服~
2 p: M7 L7 k4 z
3 F, x# h9 T. G% L3 f: Y+ ?* D$ }
转发自
派生社区
Python交流群:1047602540
3 f$ V3 L* I; }. b$ o3 E/ g
0 q6 g8 k2 k2 F8 l, ^
& E0 v! |3 y; w
欢迎光临 数学建模社区-数学中国 (http://www.madio.net/)
Powered by Discuz! X2.5