#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Python spider that scrapes company information related to the one-card
(smart card) industry.
Author: yangyongzhen
Version: 0.0.2
Date: 2014-12-14
Language: Python 2.7.5
Editor: Sublime Text 2
"""
import re
import sys
import threading
import time
import urllib2
import Queue

from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding('utf8')

_DATA = []
FILE_LOCK = threading.Lock()
SHARE_Q = Queue.Queue()     # unbounded task queue
_WORKER_THREAD_NUM = 3      # number of worker threads

_Num = 0                    # running total of scraped records
class MyThread(threading.Thread):

    def __init__(self, func, num):
        super(MyThread, self).__init__()    # call the parent constructor
        self.func = func                    # worker function this thread runs
        self.thread_num = num

    def run(self):
        self.func()
        # print u'Thread ID:', self.thread_num

def worker():
    global SHARE_Q
    while not SHARE_Q.empty():
        url = SHARE_Q.get()         # take a task from the queue
        my_page = get_page(url)
        find_data(my_page)          # extract the data on the current page
        time.sleep(1)               # be polite: pause between requests
        SHARE_Q.task_done()
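
MyThread plus worker is the standard queue-based worker-pool pattern: each thread loops, pulls a url off the shared queue, and reports completion with task_done(). Below is a minimal self-contained sketch of the same pattern (the dummy tasks and the consume name are made up for illustration); it uses a non-blocking get to avoid the small race between empty() and get() that the script itself tolerates:

import threading
import Queue

q = Queue.Queue()
for n in xrange(5):
    q.put(n)                            # enqueue five dummy tasks

def consume():
    while True:
        try:
            task = q.get(block=False)   # non-blocking: no empty()/get() race
        except Queue.Empty:
            break                       # queue drained, thread exits
        print "handled task", task
        q.task_done()

threads = [threading.Thread(target=consume) for _ in xrange(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()
q.join()                                # returns once every task_done() has fired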

def get_page(url):
    """
    Fetch the HTML of the page at the given url.

    Args:
        url: the url of the page to fetch
    Returns:
        the full page HTML decoded to unicode (the site serves GBK),
        or an empty string if the request failed
    """
    my_page = ""    # so a failed request returns "" instead of raising NameError
    try:
        html = urllib2.urlopen(url).read()
        my_page = html.decode("gbk", 'ignore')
    except urllib2.URLError, e:
        if hasattr(e, "code"):
            print "The server couldn't fulfill the request."
            print "Error code: %s" % e.code
        elif hasattr(e, "reason"):
            print "We failed to reach the server. Please check your url and read the reason."
            print "Reason: %s" % e.reason
    return my_page
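
get_page is written against the Python 2 urllib2 API. For reference, a rough Python 3 equivalent of the same fetch-and-decode step is sketched below (urllib2 was split into urllib.request and urllib.error in Python 3); the name get_page_py3 is mine, and the GBK decoding is carried over as-is:

import urllib.request
import urllib.error

def get_page_py3(url):
    """Python 3 sketch of get_page: fetch url, decode GBK bytes to str."""
    try:
        html = urllib.request.urlopen(url).read()
        return html.decode("gbk", "ignore")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print("The server couldn't fulfill the request. Error code: %s" % e.code)
        elif hasattr(e, "reason"):
            print("We failed to reach the server. Reason: %s" % e.reason)
        return ""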

def find_data(my_page):
    """
    Parse the full page HTML and extract the company entries.

    Args:
        my_page: HTML text of one result page, as returned by get_page()
    """
    global _Num
    temp_data = []
    items = BeautifulSoup(my_page).find_all(
        "div",
        style="width:96%;margin:10px;border-bottom:1px #CCC dashed;padding-bottom:10px;")
    for item in items:
        # company name
        if item.a:
            data = item.a.string.encode("gbk", "ignore")
            print data
            temp_data.append(data)
        # products and contact information
        goods = item.find_all("div", style="font-size:12px;")
        for i in goods:
            data = i.get_text().encode("gbk", "ignore")
            temp_data.append(data)
            print data
        # link url
        pat = re.compile(r'href="([^"]*)"')
        h = pat.search(str(item))
        if h:
            href = h.group(1)
            print href
            temp_data.append(href)
        with FILE_LOCK:             # _Num is shared by all worker threads
            _Num += 1
    _DATA.append(temp_data)
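
To see what find_data actually matches, here is a minimal illustration on a made-up HTML fragment that imitates the target site's markup (the fragment and its values are hypothetical; only the two style strings come from the script):

import re
from bs4 import BeautifulSoup

SAMPLE = '''
<div style="width:96%;margin:10px;border-bottom:1px #CCC dashed;padding-bottom:10px;">
  <a href="http://example.com/company/1">Example Card Co.</a>
  <div style="font-size:12px;">Products: card readers</div>
  <div style="font-size:12px;">Contact: 010-0000-0000</div>
</div>
'''

soup = BeautifulSoup(SAMPLE)
for item in soup.find_all(
        "div",
        style="width:96%;margin:10px;border-bottom:1px #CCC dashed;padding-bottom:10px;"):
    print item.a.string                         # -> Example Card Co.
    for g in item.find_all("div", style="font-size:12px;"):
        print g.get_text().strip()              # -> the products / contact lines
    print re.search(r'href="([^"]*)"', str(item)).group(1)   # -> the link url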

def main():
    global SHARE_Q
    threads = []
    start = time.clock()
    base_url = "http://company.yktworld.com/comapny_search.asp?page={page}"
    # enqueue the tasks; in real use you would keep feeding the queue
    for index in xrange(20):
        SHARE_Q.put(base_url.format(page=index))
    for i in xrange(_WORKER_THREAD_NUM):
        thread = MyThread(worker, i)
        thread.start()              # the thread starts working through tasks
        threads.append(thread)
    for thread in threads:
        thread.join()
    SHARE_Q.join()
    i = 0
    with open("down.txt", "w+") as my_file:
        for page in _DATA:
            i += 1
            for name in page:
                my_file.write(name + "\n")
    print "Spider Successful!!!"
    end = time.clock()
    print u'Scraping finished!'
    print u'Total pages:', i
    print u'Total records:', _Num
    print u'Elapsed:', end - start, u'seconds'


if __name__ == '__main__':
    main()
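
As the header notes, the script targets Python 2.7.5. If you want to run it under Python 3 instead, the Python-2-only constructs map roughly as follows (a porting checklist, not a tested conversion):

# Python 2 construct                           rough Python 3 equivalent
# import urllib2                               import urllib.request, urllib.error
# import Queue                                 import queue
# xrange(n)                                    range(n)
# print "text"                                 print("text")
# except urllib2.URLError, e:                  except urllib.error.URLError as e:
# time.clock()                                 time.perf_counter()
# reload(sys); sys.setdefaultencoding('utf8')  unnecessary; str is already unicode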