数学建模社区-数学中国
标题:
Python-网页爬虫与Sqlite3
[打印本页]
作者:
檀俾九
时间:
2021-1-14 08:44
标题:
Python-网页爬虫与Sqlite3
[code=python]#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Crawl paginated listing pages and record each entry in a SQLite table.

For every page ``http://xxx/index_<n>.html`` (n = 1..1045) the script parses
each ``div.loop`` element, extracts the entry's link, title and author, and
inserts a row into the ``Python`` table of ``Python.db`` unless a row with
the same URL is already stored.
"""

import sqlite3
import requests
from bs4 import BeautifulSoup
from re import escape  # unused by the script now; import kept so the import list is unchanged


if __name__ == '__main__':
    conn = sqlite3.connect('Python.db')
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
        )''')
        conn.commit()

        # Browser-like User-Agent sent with every page request.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36"
        }

        for i in range(1, 1046):
            url = "http://xxx/index_%s.html" % str(i)
            # A timeout keeps one stalled response from hanging the whole crawl.
            req = requests.get(url=url, headers=headers, timeout=30)
            req.encoding = "utf-8"
            html = BeautifulSoup(req.text, "lxml")

            for div in html.find_all('div', class_='loop'):
                content_body = div.select('h2 > a')[0]
                content_infor = div.select('.content_infor > span:nth-child(3)')[0]
                entry_url = "http://xxx" + content_body.get('href')

                # Skip entries that are already stored. Scraped text is passed
                # as bound parameters, never formatted into the SQL string, so
                # quotes in a URL or title cannot alter or break the statement.
                c.execute("SELECT COUNT(*) FROM Python WHERE Url = ?", (entry_url,))
                if c.fetchone()[0] > 0:
                    continue

                # The title is stored verbatim: with bound parameters no manual
                # quote doubling or regex escaping is needed (the latter would
                # insert stray backslashes into the saved text).
                c.execute(
                    "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
                    (
                        entry_url,
                        content_body.get('title'),
                        content_infor.text.replace('xxx: ', ''),
                    ),
                )

            # Commit once per page so progress survives a later failure.
            conn.commit()
            print("第%s页" % str(i))
    finally:
        # Release the database handle even when a request or parse step raises.
        conn.close()
[/code]
转发自
派生社区
Python交流群:1047602540
欢迎光临 数学建模社区-数学中国 (http://www.madio.net/)
Powered by Discuz! X2.5