8 `' W3 w7 s Q_Num = 0 #总条数5 D. z* n: R# J1 M8 K* I% N
class MyThread(threading.Thread):
    """Worker thread that runs a caller-supplied callable.

    Attributes:
        func: zero-argument callable executed by run().
        thread_num: integer identifier assigned by the spawner.
    """

    def __init__(self, func, num):
        # Initialise the base Thread machinery before storing our state.
        super(MyThread, self).__init__()
        self.func = func
        self.thread_num = num

    def run(self):
        """Thread body: delegate all work to the injected callable."""
        self.func()
def worker():
    """Drain SHARE_Q: fetch each queued URL and parse the resulting page.

    Runs in a MyThread; exits when the queue is empty. Each task is
    acknowledged with task_done() so SHARE_Q.join() can unblock.
    """
    from Queue import Empty  # local import: Py2 stdlib queue exception

    global SHARE_Q
    while True:
        try:
            # Non-blocking get: avoids the empty()/get() race where another
            # worker steals the last item between the check and the get.
            url = SHARE_Q.get_nowait()
        except Empty:
            break
        my_page = get_page(url)
        find_data(my_page)  # extract records from the fetched page
        # write_into_file(temp_data)
        time.sleep(1)  # crude politeness delay between requests
        SHARE_Q.task_done()
def get_page(url) :4 E* |: o& s+ z1 T. y" Y' q' S0 I
""" 3 ~% Q* u# R& O4 D: T 根据所给的url爬取网页HTML - F/ O1 _# T' M" n6 N Args: ' y& B' i4 z$ Q, }: M5 b
url: 表示当前要爬取页面的url 3 F d# y9 O, D- D9 z0 \ Returns: % J8 C( U ?2 t& |5 f n 返回抓取到整个页面的HTML(unicode编码)) V* m) _2 T: u' _7 Z* _% t% z/ ~
Raises: 3 N6 R$ w R# ?5 F& k- h2 D) @ URLError:url引发的异常$ i6 A: E7 |5 u1 |+ w) n' o
"""4 R9 ^1 o) N9 A% y! l
try :3 v* L; L4 M' Z9 j
html = urllib2.urlopen(url).read()7 ^- E4 @0 P2 @' _
my_page = html.decode("gbk",'ignore') 5 Z4 C+ J; ?9 q" Y1 H1 h #my_page = unicode(html,'utf-8','ignore').encode('utf-8','ignore') " W- i6 T. |2 V0 w8 i #my_page = urllib2.urlopen(url).read().decode("utf8")* q, y& G) z2 R" w' W! _8 F
except urllib2.URLError, e : - i# m6 E. w! Q- l' [. @% k4 p3 |7 ? if hasattr(e, "code"):3 [9 A$ O2 _6 T* W }2 F' k
print "The server couldn't fulfill the request."3 V7 k q# N) {/ U0 k3 k% t
print "Error code: %s" % e.code ) u- F- X: z# a elif hasattr(e, "reason"): 0 V5 r7 o, }: l1 z; H4 {- D print "We failed to reach a server. Please check your url and read the Reason"* B% P' _& z9 ~) y) q% Q
print "Reason: %s" % e.reason / @: b3 u9 Z! V return my_page R! B) |4 _" G6 _1 V3 T! i- U% Y" X3 {. D1 T* Z8 v# W+ g) T
def find_data(my_page) :- K2 _& N L. @$ S3 X* i/ Q. w
""" . q' R ^) [1 r, n7 @; X9 D 通过返回的整个网页HTML, 正则匹配名称2 Y: j" \2 X; h8 U8 Z5 D
2 o& T/ u- C7 }; Y
Args: 0 z8 o! {1 B+ L6 v" H1 A" [ my_page: 传入页面的HTML文本用于正则匹配, e6 v9 P) E" J* v' O
""". B0 X- m1 t; G
global _Num ' \8 r1 [9 W5 u" c+ H1 X temp_data = [] / n4 O+ L5 b- F8 t items = BeautifulSoup(my_page).find_all("div", style="width:96%;margin:10px;border-bottom:1px #CCC dashed;padding-bottom:10px;")2 ?* f) k$ |% ]7 @* P
for index, item in enumerate(items) :* E, j0 h* X# C( V' P) l4 L
#print item / H' z1 N4 C% F! f- b! `3 L #print item.h1 ' [) G. E2 B6 W$ _5 Z #print h.group()$ b) J: H* n; r: o9 k3 [
#temp_data.append(item) ( m. X5 @/ ~' z" o0 {6 j #print item.find(re.compile("^a"))4 w9 C& d% p2 {) m! b ^
href = item.find(re.compile("^a")) 2 |+ m0 n% z: S) S. a #soup = BeautifulSoup(item) ; k* R9 O: l9 Y #公司名称 % \8 `0 e6 Q( Q8 n' a if item.a: T9 ^- W; m/ J$ ^1 e1 `
data = item.a.string.encode("gbk","ignore") & i1 P% Y) A1 \- h, Y# P6 G print data2 @6 {! i) m& M
temp_data.append(data) [& q9 U8 M; l7 X' I, p* {
0 T0 }: P7 p8 r) R/ \
goods = item.find_all("div", style="font-size:12px;") % i: \/ |# R; g5 Y! T4 N' c - y! u- i$ j% P$ P: r$ N1 q' l
#经营产品与联系方式& S: \+ H X+ G
for i in goods:0 ]( H+ c. K6 Q+ S& `: V; G
data = i.get_text().encode("gbk","ignore")) c- `3 @, w/ }' ]3 C/ z
temp_data.append(data) : |) H! m) \: ?% ?( }) m print data & G# d5 i- k6 W: M7 H8 {- T9 h1 ~ #b = item.find_all("b")# p: ^8 p6 Z# Q- q* s, u8 y( U
#print b * i5 Q- f$ L3 n+ W3 f- Z6 p #链接地址, ~6 D3 J2 X9 s/ p: R
pat = re.compile(r'href="([^"]*)"')& N' H' f1 \2 k# I
h = pat.search(str(item)) . e. o. a! m2 P) S. t w& R if h:0 K7 O$ u3 `1 B
#print h.group(0) 3 b1 T( f# u/ e9 E* \ href = h.group(1) & M L+ H9 Z `2 Y' Z, {$ B& Q, C print href" v' A. P) C( K2 a& ?) m
temp_data.append(h.group(1))6 L; }- n/ @' O/ _- k2 H" a9 R
/ M. @* @1 z5 T, X% k+ ^% i _Num += 1! u- u9 N2 h" U$ v
#b = item.find_all(text=re.compile("Dormouse")) 2 [ n$ x5 Q4 o' _- t- `7 I #pprint(goods) 6 ^: R, `* n8 u! y, f #print href # o' k' r; Q' B9 h #pat = re.compile(r'title="([^"]*)"')& i4 w$ h- D; c2 N# n
#h = pat.search(str(href)), \3 U7 Q9 x& e, D) d
#if h: # V. W: `8 b5 A6 L8 R0 k7 r #print h.group(1) 4 x6 k+ a5 B& [: S- v+ @ #temp_data.append(h.group(1)) ' {! i4 l4 m" I, O; E7 ^. P _DATA.append(temp_data) 2 Z. {0 u# H3 ^* h . y9 y1 ^% ^ G) E4 p- W#headers = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}##浏览器请求头(大部分网站没有这个请求头会报错、请务必加上哦)7 C! P8 e |& u+ ] x
#all_url = 'http://www.mzitu.com/all' ##开始的URL地址 }+ [( j" C" t z# i3 H) m9 S
#start_html = requests.get(all_url, headers=headers) ##使用requests中的get方法来获取all_url(就是:http://www.mzitu.com/all这个地址)的内容 headers为上面设置的请求头、请务必参考requests官方文档解释 4 r8 _- I1 \# c9 ^- O/ _0 C1 [+ w#print(start_html.text) ##打印出start_html (请注意,concent是二进制的数据,一般用于下载图片、视频、音频、等多媒体内容是才使用concent, 对于打印网页内容请使用text) / {2 Z1 Y( y% u: E, X, j- _: D$ ` - S4 g' {, z/ n) U0 A$ ?: }7 \8 q; pdef main() :7 p% w: i- ~8 m& r$ G
global SHARE_Q n8 I: l: w+ i) V
threads = []) W+ X+ c+ ^" U, s! X9 f' g" S
start = time.clock()& E% `/ m* [" B$ W, f; T& L
douban_url = "http://company.yktworld.com/comapny_search.asp?page={page}" 0 ~* d2 ~( r# O8 ? #向队列中放入任务, 真正使用时, 应该设置为可持续的放入任务 / y+ l* G* H8 S: `9 C/ d q for index in xrange(20) : 5 s7 J1 r+ Y/ I1 y+ G) ? SHARE_Q.put(douban_url.format(page = index * 1)) ( [7 `+ l! S3 G( _7 ^9 o for i in xrange(_WORKER_THREAD_NUM) : 6 A; R- F! s m, @. R8 T9 [2 U thread = MyThread(worker,i) 3 \# l. E) g" i1 l5 }6 W thread.start() #线程开始处理任务" g9 d9 t5 l7 }9 N% `2 p0 D
# o: k# I+ c: e; M% L+ K, D threads.append(thread) 8 y) {9 U/ p! M& ]: k for thread in threads :$ f+ }3 S# ^( u: M. I+ o7 f7 A
thread.join()% H6 L2 ^6 C' r
SHARE_Q.join() $ L! j2 i& Y4 k/ f i = 0" w l" d3 F0 e
with open("down.txt", "w+") as my_file :7 B0 [7 I0 Z+ b4 R$ E5 J. }5 }
for page in _DATA :% l8 O W9 F1 f7 M2 e z* m
i += 1 / `* X4 s' r9 N1 R for name in page: ; W9 k% `) V* J# }3 G my_file.write(name + "\n"): p1 E+ A0 E, Y" A# G
! D, j! @4 @# y print "Spider Successful!!!" & x2 R4 e" h. G+ v end = time.clock()" r# _/ C. n3 k3 v: x- t
print u'抓取完成!'& y: [( T3 ^0 B1 `1 L
print u'总页数:',i " O8 U! c$ y6 r0 F) m7 j- H: j/ i print u'总条数:',_Num6 y: K' x2 I" X
print u'一共用时:',end-start,u'秒' 7 _. r2 n3 J' t9 C1 X, y7 j. ~1 R 5 ?% k* D" M* H1 D& k- u7 Yif __name__ == '__main__': ' G' B/ N7 Z$ ?1 s. B. h main(); g6 d( q' O7 Z: P( d0 h; \