数学建模社区-数学中国
标题:
Python-网页爬虫与Sqlite3
[打印本页]
作者:
檀俾九
时间:
2021-1-14 08:44
标题:
Python-网页爬虫与Sqlite3
[code=python]#!/usr/bin/python
# -*- coding: utf-8 -*-

import sqlite3
import requests
from bs4 import BeautifulSoup
from re import escape

# Site root; scraped hrefs are relative and are appended to this prefix.
BASE_URL = "http://xxx"
# SQLite database file the crawl results are written to.
DB_PATH = "Python.db"
# Inclusive range of index pages to crawl (index_1.html .. index_1045.html).
FIRST_PAGE = 1
LAST_PAGE = 1045
# Seconds to wait for the server before giving up on a page; without a
# timeout a single stalled connection would hang the whole crawl forever.
REQUEST_TIMEOUT = 10
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36"
}


def create_table(conn):
    """Create the ``Python`` table (Url, Title, Author) if it does not exist.

    Args:
        conn: An open ``sqlite3.Connection``.
    """
    conn.execute('''CREATE TABLE IF NOT EXISTS Python (
        Url VARCHAR,
        Title VARCHAR,
        Author VARCHAR
    )''')
    conn.commit()


def article_exists(conn, url):
    """Return True if a row with this exact ``Url`` is already stored.

    The URL is passed as a bound parameter, so quotes or SQL fragments in a
    scraped href cannot alter the query.
    """
    row = conn.execute(
        "SELECT 1 FROM Python WHERE Url = ? LIMIT 1", (url,)
    ).fetchone()
    return row is not None


def save_article(conn, url, title, author):
    """Insert one article unless its URL is already present.

    Values are bound with ``?`` placeholders and stored verbatim; no manual
    quoting or escaping is applied (the previous ``re.escape`` call was regex
    escaping and polluted stored titles with backslashes).

    Args:
        conn: An open ``sqlite3.Connection``.
        url: Absolute article URL; used as the de-duplication key.
        title: Article title (may be None if the link had no ``title``).
        author: Author text.

    Returns:
        True if a new row was inserted, False if the URL already existed.
        The caller is responsible for committing.
    """
    if article_exists(conn, url):
        return False
    conn.execute(
        "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
        (url, title, author),
    )
    return True


def parse_articles(html):
    """Yield ``(url, title, author)`` for every ``div.loop`` entry in a page.

    Entries lacking the title link, its ``href``, or the author span are
    skipped instead of aborting the crawl with an IndexError/TypeError.

    Args:
        html: A parsed ``BeautifulSoup`` document of one index page.
    """
    for div in html.find_all('div', class_='loop'):
        link = div.select_one('h2 > a')
        info = div.select_one('.content_infor > span:nth-child(3)')
        if link is None or info is None:
            continue
        href = link.get('href')
        if href is None:
            continue
        # The third span holds the author prefixed by a fixed label.
        author = info.text.replace('xxx: ', '')
        yield BASE_URL + href, link.get('title'), author


def crawl(conn):
    """Fetch every index page and store the articles not yet in the database.

    Commits once per page so an interrupted run keeps all completed pages;
    re-running skips URLs that are already stored.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        url = "%s/index_%s.html" % (BASE_URL, page)
        resp = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        resp.encoding = "utf-8"
        html = BeautifulSoup(resp.text, "lxml")

        for article_url, title, author in parse_articles(html):
            save_article(conn, article_url, title, author)

        conn.commit()
        print("第%s页" % page)


if __name__ == '__main__':
    conn = sqlite3.connect(DB_PATH)
    try:
        create_table(conn)
        crawl(conn)
    finally:
        # Always release the database file, even if a request or parse fails.
        conn.close()
[/code]
转发自
派生社区
Python交流群:1047602540
欢迎光临 数学建模社区-数学中国 (http://www.madio.net/)
Powered by Discuz! X2.5