#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Python spider that scrapes information about one-card (一卡通) related companies.
Author: yangyongzhen
Version: 0.0.2
Date: 2014-12-14
Language: Python 2.7.5
Editor: Sublime Text 2
"""

import urllib2, re, string
import threading, Queue, time
import sys
import os
from bs4 import BeautifulSoup
#from pprint import pprint

reload(sys)
sys.setdefaultencoding('utf8')

_DATA = []                    # scraped records, one list per page
FILE_LOCK = threading.Lock()
SHARE_Q = Queue.Queue()       # unbounded task queue
_WORKER_THREAD_NUM = 3        # number of worker threads
_Num = 0                      # total number of records

class MyThread(threading.Thread):

    def __init__(self, func, num):
        super(MyThread, self).__init__()   # call the parent constructor
        self.func = func                   # the worker function this thread runs
        self.thread_num = num

    def run(self):
        self.func()
        #print u'thread ID:', self.thread_num

def worker():
    global SHARE_Q
    while not SHARE_Q.empty():
        url = SHARE_Q.get()        # take one page URL from the task queue
        my_page = get_page(url)
        find_data(my_page)         # extract the data on this page
        #write_into_file(temp_data)
        time.sleep(1)
        SHARE_Q.task_done()
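
# One subtle point in worker(): SHARE_Q.empty() followed by a blocking get() can
# hang if another thread drains the queue between the two calls. A non-blocking
# variant, using the standard Queue API, would be roughly:
#
#   try:
#       url = SHARE_Q.get(block=False)
#   except Queue.Empty:
#       break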

def get_page(url):
    """
    Fetch the HTML of the page at the given url.
    Args:
        url: the url of the page to fetch
    Returns:
        the whole page HTML as unicode
    Raises:
        URLError: raised when opening the url fails
    """
    my_page = ""
    try:
        html = urllib2.urlopen(url).read()
        my_page = html.decode("gbk", 'ignore')
        #my_page = unicode(html, 'utf-8', 'ignore').encode('utf-8', 'ignore')
        #my_page = urllib2.urlopen(url).read().decode("utf8")
    except urllib2.URLError, e:
        if hasattr(e, "code"):
            print "The server couldn't fulfill the request."
            print "Error code: %s" % e.code
        elif hasattr(e, "reason"):
            print "We failed to reach the server. Please check your url and read the reason."
            print "Reason: %s" % e.reason
    return my_page

def find_data(my_page):
    """
    Extract the company entries from the fetched page HTML.
    Args:
        my_page: the HTML text of the page to be matched
    """
    global _Num
    temp_data = []
    items = BeautifulSoup(my_page).find_all("div", style="width:96%;margin:10px;border-bottom:1px #CCC dashed;padding-bottom:10px;")
    for index, item in enumerate(items):
        href = item.find(re.compile("^a"))
        # company name
        if item.a:
            data = item.a.string.encode("gbk", "ignore")
            print data
            temp_data.append(data)

        # products and contact details
        goods = item.find_all("div", style="font-size:12px;")
        for i in goods:
            data = i.get_text().encode("gbk", "ignore")
            temp_data.append(data)
            print data

        # link address
        pat = re.compile(r'href="([^"]*)"')
        h = pat.search(str(item))
        if h:
            href = h.group(1)
            print href
            temp_data.append(h.group(1))

        _Num += 1
    _DATA.append(temp_data)

#headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}  # browser request headers (most sites reject requests without them, so be sure to add them)
#all_url = 'http://www.mzitu.com/all'  # starting URL
#start_html = requests.get(all_url, headers=headers)  # fetch all_url with requests.get, using the headers defined above; see the requests documentation for details
#print(start_html.text)  # print start_html (note: .content is binary data, normally used when downloading images, video, audio and other media; use .text to print page content)
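
# Related to the headers note above: get_page() in this script sends no User-Agent
# at all. If the target site starts rejecting such requests, a header can be
# attached with urllib2 roughly like this (the User-Agent string below is just an
# example value):
#
#   req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
#   html = urllib2.urlopen(req).read()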

def main():
    global SHARE_Q
    threads = []
    start = time.clock()
    douban_url = "http://company.yktworld.com/comapny_search.asp?page={page}"
    # put the tasks into the queue; in real use, tasks should be added continuously
    for index in xrange(20):
        SHARE_Q.put(douban_url.format(page=index * 1))
    for i in xrange(_WORKER_THREAD_NUM):
        thread = MyThread(worker, i)
        thread.start()             # the thread starts processing tasks
        threads.append(thread)
    for thread in threads:
        thread.join()
    SHARE_Q.join()
    i = 0
    with open("down.txt", "w+") as my_file:
        for page in _DATA:
            i += 1
            for name in page:
                my_file.write(name + "\n")
    print "Spider Successful!!!"
    end = time.clock()
    print u'Scraping finished!'
    print u'Total pages:', i
    print u'Total records:', _Num
    print u'Elapsed time:', end - start, u'seconds'

if __name__ == '__main__':
    main()
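
The script encodes every scraped field to GBK bytes in find_data() and then writes those bytes into down.txt in main(). If UTF-8 output is preferred, a minimal sketch (assuming find_data() keeps the fields as unicode strings instead of calling .encode("gbk", "ignore")) could write the file through codecs instead:

import codecs

with codecs.open("down.txt", "w", encoding="utf-8") as my_file:
    for page in _DATA:
        for name in page:
            my_file.write(name + u"\n")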