#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Python spider: scrapes company listings for the one-card (smart card) industry
Author: yangyongzhen
Version: 0.0.2
Date: 2014-12-14
Language: Python 2.7.5
Editor: Sublime Text 2
"""
import urllib2
import re
import threading
import Queue
import time
import sys

from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding('utf8')

_DATA = []                       # one list of records per scraped page
FILE_LOCK = threading.Lock()     # defined but unused in this version
SHARE_Q = Queue.Queue()          # unbounded task queue
_WORKER_THREAD_NUM = 3           # number of worker threads

_Num = 0                         # total number of records scraped
class MyThread(threading.Thread):

    def __init__(self, func, num):
        super(MyThread, self).__init__()    # call the parent constructor
        self.func = func                    # the callable this thread runs
        self.thread_num = num

    def run(self):
        self.func()
        #print u'thread id:', self.thread_num
def worker():
    global SHARE_Q
    # note: the empty()/get() pair is racy with several workers; see the sketch below
    while not SHARE_Q.empty():
        url = SHARE_Q.get()      # take one task
        my_page = get_page(url)
        find_data(my_page)       # extract the records on this page
        time.sleep(1)            # be polite to the server
        SHARE_Q.task_done()
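Between empty() and get() a sibling worker can drain the last task, leaving this thread blocked on get() forever. A minimal race-free variant (my own sketch, not part of the original script; safe_worker is a hypothetical name) swaps in a non-blocking get:

def safe_worker():
    while True:
        try:
            url = SHARE_Q.get_nowait()   # raises Queue.Empty instead of blocking
        except Queue.Empty:
            return                       # queue drained, let the thread exit
        try:
            find_data(get_page(url))
            time.sleep(1)
        finally:
            SHARE_Q.task_done()          # always balance the get, even on error

Passing safe_worker instead of worker to MyThread would need no other change.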
& O/ ^5 j4 M. K p a2 j4 Ldef get_page(url) :
- W3 D; o. }) Q0 L' s, \, \1 p """+ T1 K6 y# J3 l" Y: V
根据所给的url爬取网页HTML2 x6 t1 e$ Y9 k0 i; f9 O
Args: 8 H) Y# F3 ^: F# Y3 ]
url: 表示当前要爬取页面的url
! Z; _) q' A3 q4 q: k1 i* m1 Q5 U Returns:1 y7 B" C- E; j2 G. x& B' k
返回抓取到整个页面的HTML(unicode编码)* W# {9 E5 o+ Y6 L* ?
Raises:
$ o9 K o4 L, l6 O URLError:url引发的异常+ w C! _+ _5 m7 C ?6 L0 S
"""
# B5 l% N# l9 N X9 K t try :6 [6 y ^5 q6 L
html = urllib2.urlopen(url).read()
5 N y$ h) h9 q my_page = html.decode("gbk",'ignore')
0 H( L A& R& H- Z #my_page = unicode(html,'utf-8','ignore').encode('utf-8','ignore')
' g& C. Z; |$ q3 T5 ~- Q, Z #my_page = urllib2.urlopen(url).read().decode("utf8")$ \0 T1 h8 E0 l" B
except urllib2.URLError, e :$ ~& \ Z/ m3 m9 n8 z4 }' V
if hasattr(e, "code"):
+ t$ h# n( ?) Q% N7 y4 H( U1 y print "The server couldn't fulfill the request."' E3 o' o2 a; ?
print "Error code: %s" % e.code
4 j& v" @" I& h3 `) S0 Z elif hasattr(e, "reason"):
6 E1 A! f$ T% c- ~4 L( J; x print "We failed to reach a server. Please check your url and read the Reason", Y( s* n" I/ r9 P; \* P3 S
print "Reason: %s" % e.reason, }0 @0 E" A# C% ^- Y0 P$ F
return my_page$ d! D" G) U, i# |" L" R$ x: ~
0 X2 c% h" u% ~3 ^! x8 W7 Z* {! E
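For a quick check that fetching and decoding work, get_page can be exercised on a single listing page (page 1 here is an arbitrary choice):

page = get_page("http://company.yktworld.com/comapny_search.asp?page=1")
print u'fetched %d characters' % len(page)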
def find_data(my_page):
    """
    Parse one page of HTML and collect, for every listing: company name,
    products/contact information, and the link to the detail page.

    Args:
        my_page: the HTML text of one search-result page
    """
    global _Num
    temp_data = []
    # each company listing sits in a div carrying this exact inline style
    items = BeautifulSoup(my_page).find_all(
        "div",
        style="width:96%;margin:10px;border-bottom:1px #CCC dashed;padding-bottom:10px;")
    for index, item in enumerate(items):
        # company name: text of the first <a> in the listing
        if item.a:
            data = item.a.string.encode("gbk", "ignore")
            print data
            temp_data.append(data)

        # products and contact information
        goods = item.find_all("div", style="font-size:12px;")
        for i in goods:
            data = i.get_text().encode("gbk", "ignore")
            temp_data.append(data)
            print data

        # link to the company's detail page: first href inside the listing
        pat = re.compile(r'href="([^"]*)"')
        h = pat.search(str(item))
        if h:
            href = h.group(1)
            print href
            temp_data.append(href)

        _Num += 1
    _DATA.append(temp_data)
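Those selectors assume listing markup shaped roughly like the fragment below. The fragment is invented for illustration, not captured from the real site, so treat it only as a self-contained sanity check of find_data:

SAMPLE = u'''
<div style="width:96%;margin:10px;border-bottom:1px #CCC dashed;padding-bottom:10px;">
  <a href="http://company.yktworld.com/12345.html">Example Card Co.</a>
  <div style="font-size:12px;">Products: smart cards, readers. Tel: 010-00000000</div>
</div>
'''
find_data(SAMPLE)
print _DATA[-1]    # expected: [name, products/contact line, detail link]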
# Leftover scratch from a requests-based experiment, kept commented out:
#headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}   # browser request header; many sites reject requests that lack one
#all_url = 'http://www.mzitu.com/all'   # starting URL
#start_html = requests.get(all_url, headers=headers)   # fetch all_url with the header above
#print(start_html.text)   # .content is raw bytes (for images/media); use .text for page markup
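Following that hint, here is a minimal sketch of the fetch step redone with requests (an assumption: requests is installed, and it is not imported by this script; get_page_requests is a hypothetical name):

import requests

def get_page_requests(url):
    headers = {'User-Agent': 'Mozilla/5.0'}      # placeholder UA string
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        resp.encoding = 'gbk'                    # the site serves GBK, as above
        return resp.text
    except requests.RequestException, e:
        print "Request failed: %s" % e
        return u""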
def main():
    global SHARE_Q
    threads = []
    start = time.clock()
    search_url = "http://company.yktworld.com/comapny_search.asp?page={page}"
    # enqueue the tasks; a real deployment would keep feeding the queue
    for index in xrange(20):
        SHARE_Q.put(search_url.format(page=index))
    for i in xrange(_WORKER_THREAD_NUM):
        thread = MyThread(worker, i)
        thread.start()              # the thread starts pulling tasks
        threads.append(thread)
    for thread in threads:
        thread.join()
    SHARE_Q.join()
    i = 0
    with open("down.txt", "w+") as my_file:
        for page in _DATA:
            i += 1
            for name in page:
                my_file.write(name + "\n")

    end = time.clock()
    print "Spider Successful!!!"
    print u'Total pages:', i
    print u'Total records:', _Num
    print u'Elapsed:', end - start, u'seconds'

if __name__ == '__main__':
    main()
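Assuming the file is saved as spider.py (the name is arbitrary), run it under Python 2.7 with: python spider.py. Each record is printed as it is scraped, and the full result set lands in down.txt in the working directory.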
! t& C, s r; U% v' f
- M$ b+ H) a3 F! o |
zan
|