#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Python crawler: scrapes company information for the smart-card ("一卡通") industry.
Author: yangyongzhen
Version: 0.0.2
Date: 2014-12-14
Language: Python 2.7.5
Editor: Sublime Text 2
"""
import urllib2, re, string
import threading, Queue, time
import sys
import os
from bs4 import BeautifulSoup
#from pprint import pprint

reload(sys)
sys.setdefaultencoding('utf8')

_DATA = []
FILE_LOCK = threading.Lock()
SHARE_Q = Queue.Queue()    # unbounded task queue
_WORKER_THREAD_NUM = 3     # number of worker threads

_Num = 0                   # total number of records scraped
class MyThread(threading.Thread):

    def __init__(self, func, num):
        super(MyThread, self).__init__()  # call the parent constructor
        self.func = func                  # the worker function this thread runs
        self.thread_num = num

    def run(self):
        self.func()
        #print u'thread id:', self.thread_num

def worker():
    global SHARE_Q
    while not SHARE_Q.empty():
        url = SHARE_Q.get()    # take a task from the queue
        my_page = get_page(url)
        find_data(my_page)     # extract the data on the current page
        #write_into_file(temp_data)
        time.sleep(1)
        SHARE_Q.task_done()
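
One caveat in worker(): the SHARE_Q.empty() check and the following SHARE_Q.get() are two separate operations, so with several workers one thread can pass the check, lose the race to another thread, and then block forever in get(). A minimal race-free sketch using the stdlib's Queue.Empty exception (worker_safe is a hypothetical name, not part of the original script):

def worker_safe():
    while True:
        try:
            url = SHARE_Q.get(block=False)  # non-blocking; raises Queue.Empty once drained
        except Queue.Empty:
            break
        try:
            find_data(get_page(url))
            time.sleep(1)
        finally:
            SHARE_Q.task_done()  # every successful get() must be paired with task_done()
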
def get_page(url):
    """
    Fetch the HTML of the page at the given url.

    Args:
        url: the url of the page to fetch
    Returns:
        the full page HTML, decoded to unicode
    Raises:
        URLError: raised by the underlying urlopen call
    """
    my_page = ""  # default, so the function still returns a string on a failed request
    try:
        html = urllib2.urlopen(url).read()
        my_page = html.decode("gbk", 'ignore')
        #my_page = unicode(html,'utf-8','ignore').encode('utf-8','ignore')
        #my_page = urllib2.urlopen(url).read().decode("utf8")
    except urllib2.URLError, e:
        if hasattr(e, "code"):
            print "The server couldn't fulfill the request."
            print "Error code: %s" % e.code
        elif hasattr(e, "reason"):
            print "We failed to reach the server. Please check your url and read the reason."
            print "Reason: %s" % e.reason
    return my_page

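The commented-out notes further down mention that many sites reject requests that lack a browser User-Agent. If this site ever starts doing that, a minimal sketch of the same fetch with a spoofed header (get_page_with_ua is a hypothetical name; the UA string is just an example):

def get_page_with_ua(url):
    # hypothetical variant of get_page: same fetch, but sends a browser User-Agent
    req = urllib2.Request(url, headers={
        "User-Agent": ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
                       "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1")
    })
    return urllib2.urlopen(req).read().decode("gbk", "ignore")
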
def find_data(my_page):
    """
    Scan the full page HTML and extract each company entry.

    Args:
        my_page: the page HTML text to match against
    """
    global _Num
    temp_data = []
    items = BeautifulSoup(my_page).find_all("div", style="width:96%;margin:10px;border-bottom:1px #CCC dashed;padding-bottom:10px;")
    for index, item in enumerate(items):
        # company name
        if item.a:
            data = item.a.string.encode("gbk", "ignore")
            print data
            temp_data.append(data)

        # products and contact details
        goods = item.find_all("div", style="font-size:12px;")
        for i in goods:
            data = i.get_text().encode("gbk", "ignore")
            temp_data.append(data)
            print data

        # link address
        pat = re.compile(r'href="([^"]*)"')
        h = pat.search(str(item))
        if h:
            href = h.group(1)
            print href
            temp_data.append(href)

        _Num += 1
    _DATA.append(temp_data)

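Note that find_data() runs on three worker threads at once yet mutates the shared _DATA list and the _Num counter. list.append is effectively atomic under CPython's GIL, but _Num += 1 is a read-modify-write and can lose increments under contention. The script already creates FILE_LOCK without ever using it; a minimal sketch of putting it to work (record_result is a hypothetical helper, not in the original):

def record_result(temp_data, count):
    # hypothetical helper: serialize updates to the shared globals
    global _Num
    with FILE_LOCK:        # threading.Lock works as a context manager
        _DATA.append(temp_data)
        _Num += count
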
#headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}  # browser request header (most sites reject requests without one, so be sure to include it)
#all_url = 'http://www.mzitu.com/all'  # starting URL
#start_html = requests.get(all_url, headers=headers)  # fetch all_url with requests.get(); headers is the request header set above, see the requests docs
#print(start_html.text)  # note: .content is binary data, meant for downloading images/video/audio and other media; use .text to print page content

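For reference, a runnable version of what those notes sketch, applied to this script's target site (assumes the third-party requests library is installed; get_page_requests is a hypothetical name):

import requests

def get_page_requests(url):
    # hypothetical requests-based replacement for get_page
    resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
    resp.encoding = "gbk"  # the site serves gbk; requests would otherwise guess the charset
    return resp.text
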
def main():
    global SHARE_Q
    threads = []
    start = time.clock()
    douban_url = "http://company.yktworld.com/comapny_search.asp?page={page}"
    # enqueue the tasks; in real use, tasks should be fed in continuously
    for index in xrange(20):
        SHARE_Q.put(douban_url.format(page=index))
    for i in xrange(_WORKER_THREAD_NUM):
        thread = MyThread(worker, i)
        thread.start()  # the thread starts pulling tasks
        threads.append(thread)
    for thread in threads:
        thread.join()
    SHARE_Q.join()
    i = 0
    with open("down.txt", "w+") as my_file:
        for page in _DATA:
            i += 1
            for name in page:
                my_file.write(name + "\n")
    print "Spider Successful!!!"
    end = time.clock()
    print u'scraping finished!'
    print u'total pages:', i
    print u'total records:', _Num
    print u'elapsed:', end - start, u'seconds'

if __name__ == '__main__':
    main()
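
The header pins this to Python 2.7.5, and several things it relies on do not exist in Python 3. For anyone porting it, the equivalents are roughly as follows (a sketch of the renamed imports, not a full port):

from urllib.request import urlopen   # replaces urllib2.urlopen
from urllib.error import URLError    # replaces urllib2.URLError
import queue                         # replaces Queue (queue.Queue, queue.Empty)
# print becomes a function, unicode literals are the default, and the
# reload(sys)/sys.setdefaultencoding dance is unnecessary. time.clock()
# was removed in Python 3.8; use time.perf_counter() for timing.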