数学建模社区-数学中国
标题: 爬虫抓取信息 [打印本页]
作者: 2744557306    时间: 2024-3-31 17:18
#!/usr/bin/env python
* D3 z* [0 z7 W
# -*- coding:utf-8 -*-
& Y9 A* s1 r" d. R+ o3 T6 {
"""
( X# {: \" n: b1 Q, S0 E
Python爬虫,抓取一卡通相关企业信息
7 U% J3 H* \7 W, {6 Y5 e% I0 T3 q
Anthor: yangyongzhen
8 w! }8 |1 M& m
Version: 0.0.2
* Y+ L) O2 M5 ^( V$ i2 F" Q
Date: 2014-12-14
- n7 Y: @/ o0 V
Language: Python2.7.5
" B2 E3 o! w8 n" Y# N7 @- F! b& T
Editor: Sublime Text2
0 w% V0 O9 T# g) a0 M6 X
"""
+ V. ~7 ^3 U$ K
+ f5 `3 \* s9 q; s+ x
import urllib2, re, string
' h' F# f5 W: x
import threading, Queue, time
/ \) }# ^. y5 N- y" {: L, x
import sys
^* u# [! O- G3 ?; H2 c" c
import os
7 C* p6 \) e+ B' ~0 {+ k* h
from bs4 import BeautifulSoup
, c3 G5 d$ Y$ Q) h7 V
#from pprint import pprint
* Q9 u+ O; W* e. X& Y: n. Z2 d
7 c @; i }) ~! Q+ R5 ^4 |
reload(sys)
sys.setdefaultencoding('utf8')  # Py2 hack: default codec for implicit str<->unicode

# All scraped records; each entry is the list of strings from one page.
_DATA = []
# Lock shared between worker threads for guarding shared state.
FILE_LOCK = threading.Lock()
# Unbounded task queue of listing-page URLs to fetch.
SHARE_Q = Queue.Queue()
# How many worker threads to spawn.
_WORKER_THREAD_NUM = 3

# Running total of scraped records.
_Num = 0
9 I$ O! d6 n* J
class MyThread(threading.Thread):
    """Worker thread that simply runs the callable injected at construction."""

    def __init__(self, func, num):
        """Args:
            func: zero-argument callable holding the thread's work loop.
            num: small integer id used to tell the threads apart.
        """
        threading.Thread.__init__(self)  # initialise the base Thread
        self.func = func
        self.thread_num = num

    def run(self):
        # All the actual work lives in the injected callable.
        self.func()
def worker():
    """Worker-thread loop: drain page URLs from SHARE_Q until it is empty.

    For each URL: fetch the page, extract its records into _DATA, and
    acknowledge the task so SHARE_Q.join() can eventually return.
    """
    global SHARE_Q
    while True:
        # Fix: the original checked SHARE_Q.empty() and then called get().
        # With several workers the queue can drain between those two calls,
        # blocking get() forever; get_nowait + Empty closes the race.
        try:
            url = SHARE_Q.get_nowait()
        except Queue.Empty:
            break
        try:
            my_page = get_page(url)
            find_data(my_page)  # parse this page's records into _DATA
            time.sleep(1)       # politeness delay between requests
        finally:
            # Always acknowledge the task, even if parsing raised;
            # otherwise SHARE_Q.join() in main() would deadlock.
            SHARE_Q.task_done()
def get_page(url) :
( c! J+ X! `0 E! Q; l$ h; i# G$ ]) {0 T5 y
"""
% z9 t6 ~8 j2 E
根据所给的url爬取网页HTML
. C+ H! V2 r8 L$ C& P \- g
Args:
% Q. q- f( \( ?& @( U% Y. E
url: 表示当前要爬取页面的url
5 P- K+ n3 z- B/ r
Returns:
( Y8 A `. S8 l! C8 U% @: R+ [
返回抓取到整个页面的HTML(unicode编码)
/ b8 N+ v& W8 v/ S% T6 z# }
Raises:
. _) i2 e2 D1 J5 \$ K& ?
URLError:url引发的异常
. v/ \3 I h# |3 ~
"""
o% b: @+ N( X# n
try :
% F9 e( N' R; o" {5 x2 K* e$ _
html = urllib2.urlopen(url).read()
& U9 ~) Q" o4 m- e2 g: [, ]( S0 i
my_page = html.decode("gbk",'ignore')
% k5 y4 {2 m. S3 D. R* g/ R
#my_page = unicode(html,'utf-8','ignore').encode('utf-8','ignore')
9 K# z7 n9 L# Y' c" Y
#my_page = urllib2.urlopen(url).read().decode("utf8")
# B9 q2 @: j7 N+ I' ?1 V W) d
except urllib2.URLError, e :
0 s9 c0 }2 ^% ]* F7 {
if hasattr(e, "code"):
1 W+ r( c2 q+ b7 |
print "The server couldn't fulfill the request."
. _" ?2 u, d# V$ j" g: U
print "Error code: %s" % e.code
, k0 |0 u( N6 y% s! o
elif hasattr(e, "reason"):
+ T: H0 r) z: t# D4 M( m! N( m
print "We failed to reach a server. Please check your url and read the Reason"
; ]1 o I: Q/ S/ z' U1 l
print "Reason: %s" % e.reason
- |# @, e( r; ?. g& Q! A
return my_page
$ H( l4 Z: [4 s# a! m
& G8 v. \1 O" [4 ]' K+ }# D
def find_data(my_page) :
2 Q- O [& j' J2 B/ S8 [
"""
: d. B" X& L: @! g: O% a
通过返回的整个网页HTML, 正则匹配名称
$ ?( v! d6 l) J7 u
. T1 I9 s& {$ |; a- A% ]* t8 k
Args:
% j. p( J: K% n5 C9 @; v$ T
my_page: 传入页面的HTML文本用于正则匹配
: c# g" c. r/ w5 J6 y) s
"""
$ O) p( r( [4 G) W& h/ R, J
global _Num
# j- @) p1 P" @* E
temp_data = []
7 |& |( M% M' ^% b# w
items = BeautifulSoup(my_page).find_all("div", style="width:96%;margin:10px;border-bottom:1px #CCC dashed;padding-bottom:10px;")
9 u1 W1 r( Q0 I" K
for index, item in enumerate(items) :
1 B5 e# p1 |3 u/ |* @ K, Z. ]
#print item
) S: P2 e! [2 X
#print item.h1
* S ~. C! p6 r7 |6 K
#print h.group()
1 t" A. T- c- }2 ~( q( Z# W
#temp_data.append(item)
6 @+ {4 C/ E1 P6 C- x
#print item.find(re.compile("^a"))
1 v$ M* N0 D2 i$ q6 h* {7 N
href = item.find(re.compile("^a"))
& O0 K* k8 {4 K! w
#soup = BeautifulSoup(item)
8 T; p7 U8 y4 }+ c5 K
#公司名称
* ?. q: H! n0 h8 Q
if item.a:
+ e( [# k! z: t }* D9 Q
data = item.a.string.encode("gbk","ignore")
7 @; V# n7 x3 H! \3 a6 L
print data
, L* B+ t$ ~- ~: _
temp_data.append(data)
9 u0 i9 d) n0 }& ]
# ]8 i+ R- K/ j
goods = item.find_all("div", style="font-size:12px;")
, K' t) g1 b L* n
2 \2 K. @+ P% D4 n9 v: u2 `
#经营产品与联系方式
3 M4 |3 C6 d3 y/ U+ f2 u# q
for i in goods:
* S/ d. V) ~) F0 y/ M5 w
data = i.get_text().encode("gbk","ignore")
) {6 X' w; e0 y4 N8 |
temp_data.append(data)
8 |9 A3 c5 p; X
print data
5 E3 l5 B1 I, q' p, P
#b = item.find_all("b")
$ e1 U; H/ r' `# p% F) |
#print b
5 S, c3 S, i+ [$ m! z
#链接地址
2 V" u3 |, P& D+ Q: o0 ?
pat = re.compile(r'href="([^"]*)"')
1 q) C, S; m B7 L8 v* J
h = pat.search(str(item))
2 b, V2 W# \$ Z: l8 E
if h:
1 r: ?3 v- z$ K& H7 u$ j7 a+ T; P9 o
#print h.group(0)
" a- j- U6 w; J! Y6 I- A
href = h.group(1)
6 b5 N' b0 K% ^9 a9 M
print href
1 @( h1 I- c+ p5 J
temp_data.append(h.group(1))
2 u3 d" ?9 m ?) Y- u# C
# G7 B. k: M5 X1 l
_Num += 1
. N% {- k8 E. T. D
#b = item.find_all(text=re.compile("Dormouse"))
1 s6 ~5 L" ?8 b- i- {/ e& g& _1 J
#pprint(goods)
/ y. `% X5 n' U& N' b* K) P
#print href
0 w: C5 B7 m. h5 ?+ h' E; D+ ~) c
#pat = re.compile(r'title="([^"]*)"')
* l/ P) R( t% s
#h = pat.search(str(href))
$ u1 D I6 v# C4 h0 d' U# D- I" x5 [
#if h:
8 L9 r; L( _ O; L5 i. n
#print h.group(1)
$ M0 v l4 R7 C. I( _5 x3 {
#temp_data.append(h.group(1))
* e( ~' {3 |3 O+ |' p
_DATA.append(temp_data)
9 n0 y( F% h# r7 C) Y
, r; j8 f5 x& T+ z
#headers = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}##浏览器请求头(大部分网站没有这个请求头会报错、请务必加上哦)
) u/ F, D0 l; V8 W
#all_url = 'http://www.mzitu.com/all' ##开始的URL地址
3 x5 l% [: U. ?. r# G
#start_html = requests.get(all_url, headers=headers) ##使用requests中的get方法来获取all_url(就是:http://www.mzitu.com/all这个地址)的内容 headers为上面设置的请求头、请务必参考requests官方文档解释
: m+ G6 `! ]+ l
#print(start_html.text) ##打印出start_html (请注意,concent是二进制的数据,一般用于下载图片、视频、音频、等多媒体内容是才使用concent, 对于打印网页内容请使用text)
5 M7 y2 a, T* z: |5 D/ ?
+ f: y1 k/ U; f' @8 o
def main() :
7 C! X% Y5 S: {* c0 o$ X- H
global SHARE_Q
% l; s2 D3 P- t/ y* Q
threads = []
9 B$ A0 l! N% W# y7 u3 J0 W' B# D
start = time.clock()
) J) y' A( {3 s" k; S
douban_url = "http://company.yktworld.com/comapny_search.asp?page={page}"
& D. v) B( g2 w0 H- V! ?3 }
#向队列中放入任务, 真正使用时, 应该设置为可持续的放入任务
. k8 i" Y* W& a2 p% A
for index in xrange(20) :
% h+ G s c& K1 K+ [9 X7 R
SHARE_Q.put(douban_url.format(page = index * 1))
) B. k' O M7 {
for i in xrange(_WORKER_THREAD_NUM) :
4 o+ X; P9 H" J1 x4 ]
thread = MyThread(worker,i)
" v! r2 E1 U/ y) D$ O
thread.start() #线程开始处理任务
# [: u1 N4 ~' g0 M! ~
9 b; x, @- C0 t! g, [4 O, w
threads.append(thread)
- g- A m5 Q- D* J' A7 O& [! Y/ y
for thread in threads :
" h" }" O: b5 h; [. N3 H
thread.join()
& D( v% T! Y9 d" }: k6 b h
SHARE_Q.join()
6 M3 D9 W0 C" |" x L. _ G5 N
i = 0
1 R7 Q" _6 x8 V9 S5 f9 J9 R) o
with open("down.txt", "w+") as my_file :
4 t0 {, Q2 B( D, M0 Z# U
for page in _DATA :
/ Z- ~$ {4 l% ~- n- t8 u
i += 1
3 o8 s8 w- W# _8 \3 X
for name in page:
. f' A7 [! m/ z, U. I* y
my_file.write(name + "\n")
8 f$ D7 m! H5 }: x- k
# V9 u# n1 J, w2 X# {! n# _8 s
print "Spider Successful!!!"
; V- _% [ Z" b$ }, B# S( D/ m Y
end = time.clock()
* I; k, b y& C a, _' U3 r5 Y
print u'抓取完成!'
7 v' D$ p4 o, D, }# P
print u'总页数:',i
; @9 J1 l4 z% `2 |/ R- ^4 Q
print u'总条数:',_Num
" ]5 m p1 M) w7 X3 z; C
print u'一共用时:',end-start,u'秒'
3 `- k& W( H+ c0 P. R/ W3 I, C
' q: l* d. M0 Y6 j/ d
if __name__ == '__main__':
( ?! q9 U! s2 z6 g: C
main()
7 v3 z# x0 L# x5 Y. s, u2 B
+ m7 q! Y# G1 ~4 U' A \
* _$ z8 R+ B" W, s
欢迎光临 数学建模社区-数学中国 (http://www.madio.net/)
Powered by Discuz! X2.5