#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Python crawler: scrapes company information related to the smart-card
("yikatong") industry from company.yktworld.com.

Author: yangyongzhen
Version: 0.0.2
Date: 2014-12-14
Language: Python 2.7.5
Editor: Sublime Text 2
"""
# Standard library.
import urllib2
import re
import string
import threading
import Queue
import time
import sys
import os

# Third party.
from bs4 import BeautifulSoup

# HACK: force the interpreter-wide default encoding to utf-8 so implicit
# str<->unicode conversions below do not raise UnicodeDecodeError.
# site.py removes setdefaultencoding from sys, hence the reload().
reload(sys)
sys.setdefaultencoding('utf8')

_DATA = []                     # scraped rows; one list of strings per page
FILE_LOCK = threading.Lock()   # intended to guard file writes — TODO confirm, not used in visible code
SHARE_Q = Queue.Queue()        # unbounded task queue of page URLs
_WORKER_THREAD_NUM = 3         # number of worker threads
_Num = 0 #总条数 9 Y* v% U+ ~# Iclass MyThread(threading.Thread) :% e6 y3 ` S. c( u, B- F! C' H
4 s4 ~" u: v% ?4 x
def __init__(self, func,num) : - N( H# y8 x" a' _/ d* l; Z2 |. r5 r super(MyThread, self).__init__() #调用父类的构造函数! X# l6 h5 J' d( T# _. }, w7 D
self.func = func #传入线程函数逻辑9 o: e) Z: s" q- r
self.thread_num = num , Q9 ~& \$ W, h- d* Z( h; X9 g6 v
def run(self) :$ P4 [8 f* ^+ p$ z2 D
self.func()* k" O+ K6 j. b( J" Y
#print u'线程ID:',self.thread_num' c- E( T S! d5 o" U
/ M4 ?* z, `* m8 {( K7 o: Sdef worker() : , n/ K! u$ v7 T7 ]; I7 _5 Q# ^ global SHARE_Q, i0 H* V* Y2 A" G
while not SHARE_Q.empty():$ \; J/ B, T; n/ |7 ?8 j8 |3 {
url = SHARE_Q.get() #获得任务0 e2 A+ w. O+ R: C% g4 @# B
my_page = get_page(url) + W4 z7 r" d R5 t' d, Q. u find_data(my_page) #获得当前页面的数据 3 H N: D- ~- Z' r- U #write_into_file(temp_data) * s9 \* v) [5 X } time.sleep(1)& h1 L; b4 |$ H
SHARE_Q.task_done() + ?; a; F" }1 ?3 e+ y5 ~# E ( u+ g+ ~/ V' t9 odef get_page(url) : 1 {4 `3 B5 u% l6 e3 e* M3 M* ~ """9 {1 [; E4 a' C) k9 b ^5 T& q
根据所给的url爬取网页HTML2 I. | E4 j& v2 C
Args: ! U' p( L* b) C! c( N2 @
url: 表示当前要爬取页面的url' y: e9 m- r8 S
Returns:- B' `6 g/ V) J
返回抓取到整个页面的HTML(unicode编码) 8 _$ i8 [, A$ S4 J* r1 c Raises: . r( W) J' H5 j' C URLError:url引发的异常9 G# o8 S! \& K1 s& f$ Y3 M
""" + b, I3 Y; N# N# u3 A try :+ s6 p. S, i- a8 @7 K3 V
html = urllib2.urlopen(url).read() ) C# }0 N. H |$ p% ^; S my_page = html.decode("gbk",'ignore'), K1 w: J: v- t3 m/ {/ c9 x
#my_page = unicode(html,'utf-8','ignore').encode('utf-8','ignore') , I L$ y5 Q" ^/ k- e& K #my_page = urllib2.urlopen(url).read().decode("utf8") 0 Q$ ~9 P; {$ l- C( v except urllib2.URLError, e : 9 h. A% f5 I' _% A* d0 j+ _ if hasattr(e, "code"): . I2 c/ b. l! g* K" B# p print "The server couldn't fulfill the request." $ d% m+ c3 E! i( X* c6 e- ?( W print "Error code: %s" % e.code $ c* l$ W+ g& e. t& ]" \ elif hasattr(e, "reason"): $ {& I/ O E. H2 b- T print "We failed to reach a server. Please check your url and read the Reason"( F& V% n% s s7 a
print "Reason: %s" % e.reason, ]/ l. F) @% A( ~5 c( n
return my_page; ]( z8 C8 O/ r8 _3 [3 {
0 O4 F0 q1 B* ^0 u7 `8 _def find_data(my_page) : ) E" W& |- w+ E; f& U; i% F """: z9 k5 ?4 P. z l# c+ |0 c4 {2 `6 c; |5 {
通过返回的整个网页HTML, 正则匹配名称 6 l# z+ d% `! C, {, E, l: E- n: V0 S6 A5 a* _6 I" V. n
Args: 3 F4 u- Z+ Z3 c# ~% h n! W my_page: 传入页面的HTML文本用于正则匹配: {) P: Q. K" j3 L9 m
""" % S/ R0 o) P$ x+ J& o+ E global _Num $ t5 K2 w* ?" x temp_data = [] . z( f# @# B( E5 v4 P3 y* H items = BeautifulSoup(my_page).find_all("div", style="width:96%;margin:10px;border-bottom:1px #CCC dashed;padding-bottom:10px;")1 |. _% \2 ~4 V8 ?- s+ w P; J, C7 J+ S
for index, item in enumerate(items) : 4 ]9 B" t8 l: R1 I' G #print item S' I+ c6 \) y3 v |0 F6 k
#print item.h1+ y2 B3 Y* V3 g& ~% ]. Y% { X$ s% x
#print h.group()6 U7 ^4 y' l% b8 F5 p1 z( u% p
#temp_data.append(item)3 Z% w$ z' f) Q' s
#print item.find(re.compile("^a"))' c# ?8 q" C. q2 f0 H
href = item.find(re.compile("^a")) ; V' d, A) O% f* t8 j+ H1 Y #soup = BeautifulSoup(item) - `2 t0 z6 ^0 F9 \/ L. r8 L3 i #公司名称8 q" |4 C7 N: B0 w1 ~) I
if item.a: 8 U8 h' E1 e" f8 A' D data = item.a.string.encode("gbk","ignore")& ?+ N8 f! j2 I6 k6 K! d% T5 X
print data & L& R& |2 [" X1 t5 F temp_data.append(data)4 v! C0 Y: Z% Y w( I! X
( v8 h* T* |" @7 c goods = item.find_all("div", style="font-size:12px;") 1 e. r& c8 b ]* Z4 @% C 1 z! k) X, x+ w. c* }) L #经营产品与联系方式 : n, G$ E3 ~+ a$ S! N for i in goods: 3 h, S/ Q0 r, k+ D: Y* a- m data = i.get_text().encode("gbk","ignore") 0 q h% C$ m2 x$ b1 Y temp_data.append(data) 9 Y# E) {1 O8 n4 O+ C `1 X print data5 {7 w0 f/ n- ~& c- b. p
#b = item.find_all("b")# C2 C0 M$ d0 z2 g7 X( Y6 k& j7 L
#print b ! O n7 a1 c; F( g% ] #链接地址# E4 N: A+ n n( p+ ^ V: s
pat = re.compile(r'href="([^"]*)"') , W" u, A) v" w \* q h = pat.search(str(item)) 9 d* s) i- T5 Z if h: 2 `3 p$ a4 O6 C #print h.group(0). ?$ X. N1 k1 i
href = h.group(1) ( P S$ o/ N0 d$ D9 u( R print href: @0 x9 w' ~3 A. E
temp_data.append(h.group(1)); }8 h6 t4 C3 z1 s% c
( T2 ^7 ^* k. u' a _Num += 1) J k7 A7 e4 i, x0 E- p! n
#b = item.find_all(text=re.compile("Dormouse"))* Y- ^, B/ O; n" ]5 X% E7 W
#pprint(goods) & D- D: {" M6 x9 p" y9 P' p! g #print href & s l4 v+ R( |+ c- H* r #pat = re.compile(r'title="([^"]*)"') * f* R* Z/ w) ]- r; \ #h = pat.search(str(href)): _( d6 f! j! A6 g, Q
#if h: 2 G' ^6 A7 s1 w( ]9 ]% I/ H #print h.group(1) : D0 x( U( O8 o, {& | #temp_data.append(h.group(1)) 6 ^5 J1 y' Q, u0 d _DATA.append(temp_data) & k7 ]7 @3 A& C9 ?) i9 X* d, e& M: l8 C: W: f8 X I" i
#headers = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}##浏览器请求头(大部分网站没有这个请求头会报错、请务必加上哦)5 c& l+ s' v) H" d- a$ j
#all_url = 'http://www.mzitu.com/all' ##开始的URL地址 / G3 l% D8 q6 h) o#start_html = requests.get(all_url, headers=headers) ##使用requests中的get方法来获取all_url(就是:http://www.mzitu.com/all这个地址)的内容 headers为上面设置的请求头、请务必参考requests官方文档解释 * S) m! t: \9 V#print(start_html.text) ##打印出start_html (请注意,concent是二进制的数据,一般用于下载图片、视频、音频、等多媒体内容是才使用concent, 对于打印网页内容请使用text) # J6 B+ o* M, T6 K' k7 C 3 ?; q. b- b* _% {7 Pdef main() : ( L1 U( ~' y3 b. w global SHARE_Q- y2 u6 c# D- P* D3 q9 S
threads = []3 |$ L! K. E4 B' U- a& A. Y) [
start = time.clock()3 Z" t! p. L$ @
douban_url = "http://company.yktworld.com/comapny_search.asp?page={page}" 4 d- x2 y) E0 E8 x) ^ #向队列中放入任务, 真正使用时, 应该设置为可持续的放入任务 # V' B" `- ?0 r, b3 W for index in xrange(20) : ' e2 }9 i, v% w) b+ T4 b6 f% C
SHARE_Q.put(douban_url.format(page = index * 1)) 8 ]1 G. Q E% c4 G for i in xrange(_WORKER_THREAD_NUM) : 7 W K, @, _8 F$ o thread = MyThread(worker,i)( e! {( a2 M5 i. Q1 m" g! j
thread.start() #线程开始处理任务 2 F, E/ g- w! K4 J# e7 v: s5 I+ G1 v1 t2 e' e
threads.append(thread) & J6 R8 M, @1 X; l for thread in threads : 5 X. ~% g# D/ V* D; A# A thread.join()% O7 w0 U9 g4 X ^, \% K& b1 V
SHARE_Q.join() 8 N5 H2 I E$ B! A7 E. M i = 0" N9 p) i6 I) t1 i' `& F
with open("down.txt", "w+") as my_file : - Q6 m9 \4 x8 P$ H) k/ O l* v for page in _DATA :9 L* g/ D0 b4 @! D Q4 w
i += 1 1 W- V$ s5 Q! k6 X9 N' q7 k for name in page:9 k" @0 C% o4 E4 P' [5 ~
my_file.write(name + "\n")7 g# Q# g, T. c
. |1 \/ m. S8 y: K print "Spider Successful!!!"0 s4 u; N/ u* ^6 ]5 ]7 j, F
end = time.clock()4 L% B$ v) ~/ Z* T; h
print u'抓取完成!'+ q6 X* r! y, G* j: c' e) i9 B9 J3 u& x
print u'总页数:',i + ] Q/ k" N8 i+ `5 m print u'总条数:',_Num% f# W/ ] C3 @3 U, r* f: l9 |
print u'一共用时:',end-start,u'秒' : h. T( B$ ~/ n0 m4 T z' t% y 0 a3 f/ _1 X! @) J" Nif __name__ == '__main__':% D% G. ^2 Y/ W& V3 y! Q/ }
main()0 k! j: N3 E: O' ^