数学建模社区-数学中国
标题:
Python-网页爬虫与Sqlite3
[打印本页]
作者:
檀俾九
时间:
2021-1-14 08:44
标题:
Python-网页爬虫与Sqlite3
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Crawl paginated article listings and record them in a SQLite database.

For every listing page the script extracts each article's link, title and
author and stores them in the ``Python`` table of ``Python.db``.  An article
whose URL is already in the table is skipped, so the script can be re-run
without creating duplicate rows.

NOTE(review): the host ``http://xxx`` and the ``'xxx: '`` author prefix look
like redacted placeholders -- substitute the real values before running.
"""

import sqlite3
# Kept only so existing references to ``escape`` keep resolving; it is no
# longer used here because SQL values are now bound as parameters
# (``re.escape`` is a regex escaper, not an SQL escaper, and altered titles).
from re import escape  # noqa: F401

DB_PATH = 'Python.db'
BASE_URL = "http://xxx"
FIRST_PAGE = 1
LAST_PAGE = 1045
# Seconds to wait for the server; without a timeout a stalled connection
# would block the crawl forever.
REQUEST_TIMEOUT = 30
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36"
}


def ensure_table(conn):
    """Create the ``Python`` table on *conn* if it does not exist yet."""
    conn.execute('''CREATE TABLE IF NOT EXISTS Python (
        Url VARCHAR,
        Title VARCHAR,
        Author VARCHAR
    )''')
    conn.commit()


def save_article(conn, url, title, author):
    """Insert one article row unless *url* is already stored.

    Values are bound with ``?`` placeholders, so quotes or other special
    characters in scraped text can neither break nor inject into the SQL,
    and the text is stored exactly as scraped.

    Returns True when a row was inserted, False when *url* already existed.
    The caller is responsible for committing.
    """
    already_stored = conn.execute(
        "SELECT 1 FROM Python WHERE Url = ? LIMIT 1", (url,)
    ).fetchone()
    if already_stored is not None:
        return False
    conn.execute(
        "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
        (url, title, author),
    )
    return True


def fetch_listing(page):
    """Download listing page number *page* and return it as parsed HTML.

    Raises whatever ``requests.get`` raises on connection errors or timeout.
    """
    # Imported here rather than at module level so the storage helpers above
    # remain importable when the scraping dependencies are not installed.
    import requests
    from bs4 import BeautifulSoup

    url = BASE_URL + "/index_%s.html" % page
    response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.encoding = "utf-8"
    return BeautifulSoup(response.text, "lxml")


def extract_articles(html):
    """Yield ``(url, title, author)`` for each article entry in *html*.

    Entries that lack the title link, its ``href``, or the author element
    are skipped instead of aborting the whole crawl.  A link without a
    ``title`` attribute yields ``None`` as the title (stored as NULL).
    """
    for entry in html.find_all('div', class_='loop'):
        link = entry.select_one('h2 > a')
        info = entry.select_one('.content_infor > span:nth-child(3)')
        if link is None or info is None:
            continue
        href = link.get('href')
        if not href:
            continue
        yield BASE_URL + href, link.get('title'), info.text.replace('xxx: ', '')


def main():
    """Crawl every listing page, committing after each page."""
    conn = sqlite3.connect(DB_PATH)
    try:
        ensure_table(conn)
        for page in range(FIRST_PAGE, LAST_PAGE + 1):
            for url, title, author in extract_articles(fetch_listing(page)):
                save_article(conn, url, title, author)
            # Commit once per page so an interrupted run keeps finished pages.
            conn.commit()
            print("第%s页" % page)
    finally:
        # Close the connection even when a page fails part-way through.
        conn.close()


if __name__ == '__main__':
    main()
! q/ W3 `4 u2 B- S" C$ {' r
1 {- M! f$ w2 P3 f8 {
转发自
派生社区
Python交流群:1047602540
- T' h% h; C( r1 F
欢迎光临 数学建模社区-数学中国 (http://www.madio.net/)
Powered by Discuz! X2.5