#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Python crawler that scrapes information about one-card (一卡通) related companies.
Author: yangyongzhen
Version: 0.0.2
Date: 2014-12-14
Language: Python 2.7.5
Editor: Sublime Text 2
"""

import urllib2, re, string
import threading, Queue, time
import sys
import os
from bs4 import BeautifulSoup
#from pprint import pprint

reload(sys)
sys.setdefaultencoding('utf8')    # Python 2 hack so GBK/UTF-8 byte strings can be mixed without UnicodeDecodeError

_DATA = []                        # all scraped records, one list per page
FILE_LOCK = threading.Lock()
SHARE_Q = Queue.Queue()           # unbounded task queue shared by the workers
_WORKER_THREAD_NUM = 3            # number of worker threads

_Num = 0                          # total number of records scraped


class MyThread(threading.Thread):

    def __init__(self, func, num):
        super(MyThread, self).__init__()  # call the parent constructor
        self.func = func                  # the worker function this thread runs
        self.thread_num = num

    def run(self):
        self.func()
        #print u'Thread ID:', self.thread_num


def worker():
    """Keep taking URLs from the shared queue until it is empty."""
    global SHARE_Q
    while True:
        try:
            url = SHARE_Q.get(block=False)  # grab a task without blocking
        except Queue.Empty:
            break
        my_page = get_page(url)
        find_data(my_page)                  # extract the records on this page
        time.sleep(1)                       # be polite to the server
        SHARE_Q.task_done()


def get_page(url):
    """
    Fetch the HTML of the given URL.

    Args:
        url: the page to crawl.
    Returns:
        The whole page HTML decoded to unicode (empty string on failure).
    Raises:
        urllib2.URLError is caught and reported instead of being re-raised.
    """
    my_page = ""
    try:
        html = urllib2.urlopen(url).read()
        my_page = html.decode("gbk", 'ignore')   # the target site is GBK encoded
        #my_page = unicode(html, 'utf-8', 'ignore').encode('utf-8', 'ignore')
        #my_page = urllib2.urlopen(url).read().decode("utf8")
    except urllib2.URLError, e:
        if hasattr(e, "code"):
            print "The server couldn't fulfill the request."
            print "Error code: %s" % e.code
        elif hasattr(e, "reason"):
            print "We failed to reach a server. Please check your url and read the Reason"
            print "Reason: %s" % e.reason
    return my_page


def find_data(my_page):
    """
    Extract company records from the page HTML with BeautifulSoup and a regex.

    Args:
        my_page: the HTML text of one listing page.
    """
    global _Num
    temp_data = []
    items = BeautifulSoup(my_page, "html.parser").find_all(
        "div", style="width:96%;margin:10px;border-bottom:1px #CCC dashed;padding-bottom:10px;")
    for item in items:
        # company name
        if item.a:
            data = item.a.string.encode("gbk", "ignore")
            print data
            temp_data.append(data)

        # products and contact information
        goods = item.find_all("div", style="font-size:12px;")
        for i in goods:
            data = i.get_text().encode("gbk", "ignore")
            temp_data.append(data)
            print data

        # link address
        pat = re.compile(r'href="([^"]*)"')
        h = pat.search(str(item))
        if h:
            href = h.group(1)
            print href
            temp_data.append(href)

        _Num += 1
    _DATA.append(temp_data)

#headers = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}  # browser request header (most sites reject requests without it, so be sure to add it)
#all_url = 'http://www.mzitu.com/all'  # the start URL
#start_html = requests.get(all_url, headers=headers)  # use requests.get to fetch all_url (i.e. http://www.mzitu.com/all); headers is the request header above, see the official requests documentation
#print(start_html.text)  # print start_html (note: .content is binary data, normally used only for downloading images/video/audio; use .text to print page HTML)
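
def fetch_with_requests(url):
    # Hypothetical helper, not called anywhere in this spider: a minimal sketch
    # of the requests-based fetch that the commented-out lines above describe.
    # It assumes the third-party `requests` package is installed; the function
    # name and the import placed inside the function are illustrative choices,
    # not part of the original script.
    import requests
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
    resp = requests.get(url, headers=headers)  # send a browser-like User-Agent with the request
    return resp.text  # .text is the decoded HTML; .content would be the raw bytes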
" E. ~# A' _! o! g1 H6 z
def main() :8 X1 B$ h4 W7 l& G4 A
global SHARE_Q( a/ ^0 O" ~! w/ [9 ~
threads = []
, ~/ _/ ~3 E: [4 o/ G# H1 r start = time.clock()
. b/ n1 s' W! c. \/ ]; C douban_url = "http://company.yktworld.com/comapny_search.asp?page={page}"$ l& N4 m, j# e/ Z+ T5 d/ _+ _
#向队列中放入任务, 真正使用时, 应该设置为可持续的放入任务/ q9 r. T7 W4 x& e
for index in xrange(20) :
6 L0 }$ t- u) ?7 I2 n' o SHARE_Q.put(douban_url.format(page = index * 1))
0 W g. t/ b3 l" f0 ^& h for i in xrange(_WORKER_THREAD_NUM) :
. N! }6 V3 i E9 l8 b thread = MyThread(worker,i)* y0 }. u4 h4 z; o$ Q h
thread.start() #线程开始处理任务
% P8 V( l+ Q# ~5 G9 C; ` Y
6 \8 ?9 A) x( I, e: P8 ~9 \2 J- A& X9 \ threads.append(thread)
9 Q5 G! W Q: T. Z. u$ I4 O for thread in threads :
( |" {8 v) y" A7 _ thread.join()% D2 Z" A: ~5 Y5 k! [' g
SHARE_Q.join()
% P0 l! Y5 ^) _/ `2 f i = 0
; `" ^- s7 r# T3 H9 L Q+ v9 \/ ~- k with open("down.txt", "w+") as my_file :$ D* T, D7 t5 k
for page in _DATA :
: w }3 R% z1 n1 e0 P% s i += 1" F9 X2 Y& f# t
for name in page: t' [; E z9 [, G$ b5 }
my_file.write(name + "\n")' N% k+ V3 z4 l6 U5 B1 \
6 Q9 w; ~- N' V1 p print "Spider Successful!!!"
' f9 h1 Y- H" Y d end = time.clock(): @& u$ O+ n; A3 i1 w
print u'抓取完成!'" P I' s0 q# a2 a! V
print u'总页数:',i
. i; b0 H$ i2 p. d7 k3 t* C( \ print u'总条数:',_Num+ O3 T* u% G* c
print u'一共用时:',end-start,u'秒'( b; c' W& L0 z
d* g$ b" q7 [
if __name__ == '__main__':5 V4 S# J! h3 _, s8 i A; W) J
main() [+ |6 B) l S0 N
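
Run as-is under Python 2.7, the script queues 20 listing pages, works through them with 3 threads, writes every collected record to down.txt in the current directory, and prints the page count, record count and elapsed time at the end.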