- 在线时间
- 480 小时
- 最后登录
- 2026-6-1
- 注册时间
- 2023-7-11
- 听众数
- 4
- 收听数
- 0
- 能力
- 0 分
- 体力
- 7823 点
- 威望
- 0 点
- 阅读权限
- 255
- 积分
- 2934
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 1174
- 主题
- 1189
- 精华
- 0
- 分享
- 0
- 好友
- 1
该用户从未签到
 |
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Python爬虫,抓取一卡通相关企业信息
Author: yangyongzhen
Version: 0.0.2
Date: 2014-12-14
Language: Python2.7.5
Editor: Sublime Text2
"""

import os
import Queue
import re
import string
import sys
import threading
import time
import urllib2

from bs4 import BeautifulSoup
#from pprint import pprint

# Python 2 only: site.py removes sys.setdefaultencoding at startup, so the
# module is reloaded to get it back; the implicit str<->unicode codec is
# then switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

_DATA = []  # one list of extracted strings per processed page
FILE_LOCK = threading.Lock()  # lock object shared by all threads in this module
SHARE_Q = Queue.Queue()  # unbounded task queue holding the URLs to crawl
_WORKER_THREAD_NUM = 3  # number of worker threads

_Num = 0  # total number of records found
class MyThread(threading.Thread):
    """Thread that runs a caller-supplied zero-argument callable.

    Args:
        func: Callable invoked with no arguments when the thread runs.
        num: Identifier stored on the thread as ``thread_num``; this class
            does not use it for anything else.
    """

    def __init__(self, func, num):
        super(MyThread, self).__init__()  # initialise the base Thread
        self.func = func  # the work performed by run()
        self.thread_num = num

    def run(self):
        """Execute the stored callable once."""
        self.func()

def worker():
    """Process URLs from ``SHARE_Q`` until the queue is empty.

    Each URL is downloaded with ``get_page`` and the resulting HTML is handed
    to ``find_data``. The thread sleeps one second after each page.
    """
    while True:
        # get_nowait() instead of empty()+get(): with several workers the
        # queue can be drained between the two calls, and a blocking get()
        # would then wait forever.
        try:
            url = SHARE_Q.get_nowait()
        except Queue.Empty:
            break
        try:
            my_page = get_page(url)
            if my_page is not None:
                find_data(my_page)  # collect the records of this page
            time.sleep(1)
        finally:
            # Always account for the task, even if processing raised, so
            # that SHARE_Q.join() cannot block on a task nobody will finish.
            SHARE_Q.task_done()

def get_page(url, timeout=30):
    """Download ``url`` and return its body decoded from GBK.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``urllib2.urlopen`` as the timeout for
            blocking socket operations.

    Returns:
        The page as a unicode string (bytes that cannot be decoded as GBK
        are dropped), or ``None`` when the request fails.
    """
    try:
        response = urllib2.urlopen(url, timeout=timeout)
        try:
            html = response.read()
        finally:
            response.close()
    except urllib2.URLError as e:
        # An HTTP error response carries ``code``; a connection-level
        # failure carries ``reason``.
        if hasattr(e, "code"):
            print("The server couldn't fulfill the request.")
            print("Error code: %s" % e.code)
        elif hasattr(e, "reason"):
            print("We failed to reach a server. Please check your url and read the Reason")
            print("Reason: %s" % e.reason)
        return None
    except IOError as e:
        # Socket errors raised while reading the body (including timeouts)
        # are not wrapped in URLError.
        print("Failed to read %s: %s" % (url, e))
        return None
    return html.decode("gbk", "ignore")

def find_data(my_page):
    """Extract the company records of one result page and store them.

    For every matching result ``div`` the text of its first link, the text
    of its ``font-size:12px`` child divs and the first ``href`` value found
    in its markup are printed and collected. The collected strings of the
    page are appended to ``_DATA`` as one list, and ``_Num`` is increased by
    the number of result divs.

    Args:
        my_page: HTML text of the page; ``None`` is ignored.
    """
    global _Num
    if my_page is None:
        return

    # Compiled once per page instead of once per result.
    href_pattern = re.compile(r'href="([^"]*)"')
    temp_data = []
    item_count = 0

    # The parser is named explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(my_page, "html.parser")
    items = soup.find_all("div", style="width:96%;margin:10px;border-bottom:1px #CCC dashed;padding-bottom:10px;")
    for item in items:
        # Company name. get_text() is used because ``.string`` is None when
        # the link contains nested markup.
        if item.a:
            data = item.a.get_text().encode("gbk", "ignore")
            print(data)
            temp_data.append(data)

        # Products and contact details.
        for detail in item.find_all("div", style="font-size:12px;"):
            data = detail.get_text().encode("gbk", "ignore")
            temp_data.append(data)
            print(data)

        # Link address: first href attribute in the serialized result div.
        match = href_pattern.search(str(item))
        if match:
            href = match.group(1)
            print(href)
            temp_data.append(href)

        item_count += 1

    # Several worker threads call this function; the read-modify-write on
    # the shared counter must not interleave.
    with FILE_LOCK:
        _Num += item_count
        _DATA.append(temp_data)

#headers = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}##浏览器请求头(大部分网站没有这个请求头会报错、请务必加上哦)
#all_url = 'http://www.mzitu.com/all' ##开始的URL地址
#start_html = requests.get(all_url, headers=headers) ##使用requests中的get方法来获取all_url(就是:http://www.mzitu.com/all这个地址)的内容 headers为上面设置的请求头、请务必参考requests官方文档解释
#print(start_html.text) ##打印出start_html (请注意,content是二进制的数据,一般用于下载图片、视频、音频、等多媒体内容时才使用content, 对于打印网页内容请使用text)

def main(page_count=20, output_path="down.txt"):
    """Crawl the company search pages and write the results to a file.

    Queues one URL per page, runs ``_WORKER_THREAD_NUM`` worker threads until
    they finish, writes every collected string on its own line to
    ``output_path`` and prints a summary.

    Args:
        page_count: Number of result pages to queue; page numbers start at 0.
        output_path: File the collected strings are written to (truncated
            if it already exists).
    """
    # NOTE(review): the "comapny_search" spelling is kept exactly as in the
    # original URL - confirm against the site before "correcting" it.
    url_template = "http://company.yktworld.com/comapny_search.asp?page={page}"

    # time.time() gives elapsed wall-clock time; time.clock() reports CPU
    # time on Unix, which stays near zero while the workers sleep or wait.
    start = time.time()

    # Enqueue the tasks up front; in real use tasks should be fed in
    # continuously.
    for page in xrange(page_count):
        SHARE_Q.put(url_template.format(page=page))

    threads = []
    for thread_id in xrange(_WORKER_THREAD_NUM):
        thread = MyThread(worker, thread_id)
        thread.start()  # the thread starts processing tasks
        threads.append(thread)
    # Joining the threads is sufficient. Once every worker has exited
    # nothing can call task_done() any more, so an additional
    # SHARE_Q.join() could only return immediately or hang forever.
    for thread in threads:
        thread.join()

    with open(output_path, "w") as my_file:
        for page in _DATA:
            for name in page:
                my_file.write(name + "\n")
    print("Spider Successful!!!")

    end = time.time()
    print(u'抓取完成!')
    print(u'总页数: %s' % len(_DATA))
    print(u'总条数: %s' % _Num)
    print(u'一共用时: %s 秒' % (end - start))

# Run the crawler only when the file is executed as a script.
if __name__ == '__main__':
    main()

|
zan
|