#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Python spider: scrapes company information for the smart-card (yikatong) industry
Author: yangyongzhen
Version: 0.0.2
Date: 2014-12-14
Language: Python 2.7.5
Editor: Sublime Text 2
"""

import urllib2, re, string
import threading, Queue, time
import sys
import os
from bs4 import BeautifulSoup
#from pprint import pprint

reload(sys)
sys.setdefaultencoding('utf8')

_DATA = []                      # results: one list of strings per page
FILE_LOCK = threading.Lock()    # guards concurrent file writes (unused in the original flow)
SHARE_Q = Queue.Queue()         # unbounded task queue of page URLs
_WORKER_THREAD_NUM = 3          # number of worker threads
_Num = 0                        # total number of records scraped

class MyThread(threading.Thread):

    def __init__(self, func, num):
        super(MyThread, self).__init__()  # call the parent constructor
        self.func = func                  # the function this thread runs
        self.thread_num = num             # worker index, kept for debugging

    def run(self):
        self.func()
        #print u'thread id:', self.thread_num

def worker():
    global SHARE_Q
    while not SHARE_Q.empty():
        url = SHARE_Q.get()        # take one page URL off the queue
        my_page = get_page(url)
        find_data(my_page)         # extract the data on the current page
        #write_into_file(temp_data)
        time.sleep(1)              # be polite to the server
        SHARE_Q.task_done()

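# The empty()/get() pair in worker() is racy once several threads share the
# queue: another worker can drain it between the check and the get. A minimal
# sketch of a safer variant under the same globals (an alternative, not the
# original author's code):
def worker_safe():
    while True:
        try:
            url = SHARE_Q.get(block=True, timeout=2)  # wait briefly for work
        except Queue.Empty:
            return                                    # queue drained; exit
        find_data(get_page(url))
        time.sleep(1)
        SHARE_Q.task_done()
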
def get_page(url):
    """
    Fetch the HTML of the page at the given url.

    Args:
        url: the url of the page to fetch
    Returns:
        the whole page's HTML as a unicode string (empty if the fetch failed)
    Raises:
        URLError: raised by urlopen; caught and reported below
    """
    my_page = u""  # default so a failed fetch still returns something usable
    try:
        html = urllib2.urlopen(url).read()
        my_page = html.decode("gbk", 'ignore')  # the site serves GBK-encoded pages
        #my_page = unicode(html, 'utf-8', 'ignore').encode('utf-8', 'ignore')
        #my_page = urllib2.urlopen(url).read().decode("utf8")
    except urllib2.URLError, e:
        if hasattr(e, "code"):
            print "The server couldn't fulfill the request."
            print "Error code: %s" % e.code
        elif hasattr(e, "reason"):
            print "We failed to reach the server. Please check your url and read the reason."
            print "Reason: %s" % e.reason
    return my_page
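
# Many sites reject urllib2's default User-Agent (the commented-out headers
# example further down makes the same point). A minimal sketch of the same
# fetch with a browser-like header and a socket timeout -- the header string
# is only an illustration, not something this site is known to require:
def get_page_with_headers(url):
    req = urllib2.Request(url, headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
                      "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"})
    try:
        return urllib2.urlopen(req, timeout=10).read().decode("gbk", "ignore")
    except urllib2.URLError, e:
        print "Fetch failed: %s" % e
        return u""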

def find_data(my_page):
    """
    Match company records out of the whole page's HTML.

    Args:
        my_page: the page's HTML text to match against
    """
    global _Num
    temp_data = []
    # each company record sits in a <div> carrying this exact inline style
    items = BeautifulSoup(my_page).find_all(
        "div", style="width:96%;margin:10px;border-bottom:1px #CCC dashed;padding-bottom:10px;")
    for index, item in enumerate(items):
        # company name: text of the record's first <a> tag
        if item.a and item.a.string:
            data = item.a.string.encode("gbk", "ignore")
            print data
            temp_data.append(data)
        # products handled and contact details
        goods = item.find_all("div", style="font-size:12px;")
        for i in goods:
            data = i.get_text().encode("gbk", "ignore")
            temp_data.append(data)
            print data
        # link address: first href inside the record
        pat = re.compile(r'href="([^"]*)"')
        h = pat.search(str(item))
        if h:
            href = h.group(1)
            print href
            temp_data.append(href)
        _Num += 1
    _DATA.append(temp_data)
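
# Matching on the full inline style string above is brittle: any tweak to the
# site's markup silently breaks the scrape. A hedged alternative (assuming the
# style attribute keeps at least its border-bottom fragment) is to match on a
# stable substring instead -- a sketch, not the author's method:
def find_company_divs(my_page):
    soup = BeautifulSoup(my_page)
    return soup.find_all("div", style=lambda s: s and "border-bottom" in s)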

#headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}  # browser request header; most sites reject requests without one, so be sure to add it
#all_url = 'http://www.mzitu.com/all'  # starting URL
#start_html = requests.get(all_url, headers=headers)  # fetch all_url with requests.get, passing the headers set above (see the requests docs)
#print(start_html.text)  # note: .content is raw bytes, meant for downloading images/video/audio; use .text to print page HTML
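
# worker() above carries a commented-out write_into_file(temp_data) call, but
# that helper is never defined. A sketch of what it could look like, using the
# otherwise-unused FILE_LOCK so concurrent workers don't interleave writes
# (a hypothetical helper, not part of the original flow):
def write_into_file(lines):
    with FILE_LOCK:
        with open("down.txt", "a") as f:
            for line in lines:
                f.write(line + "\n")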

def main():
    global SHARE_Q
    threads = []
    start = time.clock()
    douban_url = "http://company.yktworld.com/comapny_search.asp?page={page}"
    # enqueue the tasks; in real use they should be fed in continuously
    for index in xrange(20):
        SHARE_Q.put(douban_url.format(page=index))
    for i in xrange(_WORKER_THREAD_NUM):
        thread = MyThread(worker, i)
        thread.start()  # the thread starts working through the queue
        threads.append(thread)
    for thread in threads:
        thread.join()
    SHARE_Q.join()
    # dump everything collected to down.txt, one field per line
    i = 0
    with open("down.txt", "w+") as my_file:
        for page in _DATA:
            i += 1
            for name in page:
                my_file.write(name + "\n")

    print "Spider Successful!!!"
    end = time.clock()
    print u'Scrape finished!'
    print u'Total pages:', i
    print u'Total records:', _Num
    print u'Elapsed:', end - start, u'seconds'

if __name__ == '__main__':
    main()
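
# To run (Python 2.7): execute this file directly. It scrapes 20 result pages
# from company.yktworld.com with 3 worker threads and writes the collected
# fields to down.txt in the working directory.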