数学建模社区-数学中国
标题:
Python-网页爬虫与Sqlite3
[打印本页]
作者:
檀俾九
时间:
2021-1-14 08:44
标题:
Python-网页爬虫与Sqlite3
[code=python]#!/usr/bin/python
# -*- coding: utf-8 -*-

import sqlite3
import requests
from bs4 import BeautifulSoup
from re import escape

# Site root. The host is a placeholder in this script; the scraped article
# links are appended to it as-is.
BASE_URL = "http://xxx"

# Listing pages are fetched as index_1.html through index_1045.html.
FIRST_PAGE = 1
LAST_PAGE = 1045

# Seconds to wait for the server before giving up on a request, so a stalled
# connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 30

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36"
}


def ensure_table(conn):
    """Create the ``Python`` table (Url, Title, Author) if it is missing.

    Args:
        conn: An open ``sqlite3.Connection``.
    """
    conn.execute('''CREATE TABLE IF NOT EXISTS Python (
        Url VARCHAR,
        Title VARCHAR,
        Author VARCHAR
    )''')
    conn.commit()


def save_post(conn, url, title, author):
    """Insert one article row unless a row with the same Url already exists.

    Values are bound with ``?`` placeholders, so quotes, percent signs and
    other special characters in scraped text are stored verbatim and cannot
    alter the SQL statement. The caller is responsible for committing.

    Args:
        conn: An open ``sqlite3.Connection`` whose ``Python`` table exists.
        url: Absolute article URL; used as the de-duplication key.
        title: Article title (``None`` is stored as NULL).
        author: Author name.

    Returns:
        True if a new row was inserted, False if the Url was already stored.
    """
    existing = conn.execute(
        "SELECT 1 FROM Python WHERE Url = ? LIMIT 1", (url,)).fetchone()
    if existing is not None:
        return False
    conn.execute(
        "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
        (url, title, author))
    return True


def crawl_page(conn, page):
    """Fetch one listing page and store every article found on it.

    Entries that lack the expected link or author element are skipped
    instead of aborting the crawl. The transaction is committed once the
    whole page has been processed.

    Args:
        conn: An open ``sqlite3.Connection`` whose ``Python`` table exists.
        page: Listing page number, substituted into ``index_<page>.html``.
    """
    url = "%s/index_%s.html" % (BASE_URL, page)
    req = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    req.encoding = "utf-8"
    html = BeautifulSoup(req.text, "lxml")

    for div in html.find_all('div', class_='loop'):
        link = div.select_one('h2 > a')
        info = div.select_one('.content_infor > span:nth-child(3)')
        if link is None or info is None or not link.get('href'):
            continue
        save_post(
            conn,
            BASE_URL + link.get('href'),
            link.get('title'),
            # Strip the label prefix that precedes the author name.
            info.text.replace('xxx: ', ''))

    conn.commit()


def main():
    """Crawl every listing page and record its articles in ``Python.db``."""
    conn = sqlite3.connect('Python.db')
    try:
        ensure_table(conn)
        for page in range(FIRST_PAGE, LAST_PAGE + 1):
            crawl_page(conn, page)
            print("第%s页" % str(page))
    finally:
        # Close the database even when a request or parse step raises.
        conn.close()


if __name__ == '__main__':
    main()
[/code]
转发自
派生社区
Python交流群:1047602540
欢迎光临 数学建模社区-数学中国 (http://www.madio.net/)
Powered by Discuz! X2.5