数学建模社区-数学中国

标题: Python-网页爬虫与Sqlite3 [打印本页]

作者: 檀俾九    时间: 2021-1-14 08:44
标题: Python-网页爬虫与Sqlite3
[code=python]#!/usr/bin/python
# -*- coding: utf-8 -*-

import sqlite3
import requests
from bs4 import BeautifulSoup
from re import escape

def ensure_table(conn):
    """Create the ``Python`` table (Url, Title, Author) if it is missing.

    Args:
        conn: An open ``sqlite3.Connection``.
    """
    conn.execute('''CREATE TABLE IF NOT EXISTS Python (
        Url VARCHAR,
        Title VARCHAR,
        Author VARCHAR
    )''')
    conn.commit()


def url_exists(conn, url):
    """Return True when a row with this exact ``Url`` is already stored.

    The value is bound through a ``?`` placeholder, so quotes or SQL
    fragments inside a scraped URL cannot change the statement.
    """
    row = conn.execute(
        "SELECT 1 FROM Python WHERE Url = ? LIMIT 1", (url,)
    ).fetchone()
    return row is not None


def insert_post(conn, url, title, author):
    """Insert one scraped post; the caller is responsible for committing.

    Values are passed as bound parameters, so the title is stored verbatim.
    No manual quote doubling or ``re.escape`` is needed (the latter is meant
    for regular expressions and would add stray backslashes to the data).
    """
    conn.execute(
        "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
        (url, title, author),
    )


def parse_posts(html):
    """Yield ``(url, title, author)`` for each post entry in a parsed page.

    Args:
        html: A ``BeautifulSoup`` document for one index page.

    Entries missing the title link, its ``href``, or the author span are
    skipped instead of aborting the whole crawl.
    """
    for div in html.find_all('div', class_='loop'):
        link = div.select_one('h2 > a')
        info = div.select_one('.content_infor > span:nth-child(3)')
        if link is None or info is None:
            continue
        href = link.get('href')
        if href is None:
            continue
        yield (
            "http://xxx" + href,
            link.get('title'),  # may be None; stored as NULL in that case
            info.text.replace('xxx: ', ''),
        )


if __name__ == '__main__':
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36"
    }

    conn = sqlite3.connect('Python.db')
    try:
        ensure_table(conn)

        for i in range(1, 1046):
            url = "http://xxx/index_%s.html" % str(i)
            # A timeout keeps one stalled request from hanging the crawl.
            req = requests.get(url=url, headers=headers, timeout=30)
            req.encoding = "utf-8"
            html = BeautifulSoup(req.text, "lxml")

            for post_url, title, author in parse_posts(html):
                # Skip posts stored by an earlier run or an earlier page.
                if url_exists(conn, post_url):
                    continue
                insert_post(conn, post_url, title, author)

            # Commit once per page so an interrupted run keeps finished pages.
            conn.commit()
            print("第%s页" % str(i))
    finally:
        # Close the database even if a request or parse step raised.
        conn.close()
[/code]

转发自派生社区
Python交流群:1047602540






欢迎光临 数学建模社区-数学中国 (http://www.madio.net/) Powered by Discuz! X2.5