#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Python crawler: scrapes company listings for the one-card (smart card) industry.
Author: yangyongzhen
Version: 0.0.2
Date: 2014-12-14
Language: Python 2.7.5
Editor: Sublime Text 2
"""

import urllib2, re
import threading, Queue, time
import sys
from bs4 import BeautifulSoup
#from pprint import pprint

reload(sys)
sys.setdefaultencoding('utf8')    # Python 2 hack: allow implicit unicode/str conversion

_DATA = []
FILE_LOCK = threading.Lock()
SHARE_Q = Queue.Queue()     # unbounded task queue
_WORKER_THREAD_NUM = 3      # number of worker threads

_Num = 0    # total number of records scraped
class MyThread(threading.Thread) :

    def __init__(self, func, num) :
        super(MyThread, self).__init__()    # call the parent constructor
        self.func = func                    # worker function to run
        self.thread_num = num

    def run(self) :
        self.func()
        #print u'Thread ID:', self.thread_num
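# Aside: this subclass is a thin wrapper; threading.Thread(target=func)
# would do the same job. It is kept only so thread_num is available for
# the commented-out debug print in run().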
def worker() :
    global SHARE_Q
    while not SHARE_Q.empty():
        url = SHARE_Q.get()       # fetch a task
        my_page = get_page(url)
        find_data(my_page)        # extract data from the current page
        time.sleep(1)
        SHARE_Q.task_done()
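# Note: checking SHARE_Q.empty() and then calling get() is racy once
# several workers share the queue: it can drain between the two calls.
# A minimal non-blocking alternative (a sketch, not in the original):
#
#     try:
#         url = SHARE_Q.get(block=False)
#     except Queue.Empty:
#         break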
def get_page(url) :
    """
    Fetch the HTML of the page at the given url.

    Args:
        url: the url of the page to fetch
    Returns:
        the full page HTML, decoded to unicode
    Raises:
        URLError: raised when the url cannot be reached
    """
    my_page = ""    # fall back to an empty page on error
    try :
        html = urllib2.urlopen(url).read()
        my_page = html.decode("gbk", 'ignore')
    except urllib2.URLError, e :
        if hasattr(e, "code"):
            print "The server couldn't fulfill the request."
            print "Error code: %s" % e.code
        elif hasattr(e, "reason"):
            print "We failed to reach the server. Please check your url."
            print "Reason: %s" % e.reason
    return my_page
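# Some servers reject requests that lack a browser User-Agent header. If
# the target site does (an assumption; it may not), a minimal variant is:
#
#     req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
#     html = urllib2.urlopen(req).read()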
def find_data(my_page) :
    """
    Parse the full page HTML and extract, for each listing: the company
    name, the products/contact text, and the link url.

    Args:
        my_page: the page HTML to parse
    """
    global _Num
    temp_data = []
    items = BeautifulSoup(my_page).find_all("div", style="width:96%;margin:10px;border-bottom:1px #CCC dashed;padding-bottom:10px;")
    for item in items :
        # company name
        if item.a:
            data = item.a.string.encode("gbk", "ignore")
            print data
            temp_data.append(data)

        # products and contact information
        goods = item.find_all("div", style="font-size:12px;")
        for i in goods:
            data = i.get_text().encode("gbk", "ignore")
            temp_data.append(data)
            print data

        # link url
        pat = re.compile(r'href="([^"]*)"')
        h = pat.search(str(item))
        if h:
            href = h.group(1)
            print href
            temp_data.append(href)

        with FILE_LOCK:    # _Num is shared by all worker threads
            _Num += 1
    with FILE_LOCK:
        _DATA.append(temp_data)
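# Matching on the exact inline style string is brittle: any tweak to the
# page's inline CSS silently yields zero matches. A looser match (a sketch;
# assumes the dashed border survives page restyling) is more forgiving:
#
#     items = BeautifulSoup(my_page).find_all(
#         "div", style=lambda s: s and "border-bottom" in s)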
def main() :
    global SHARE_Q
    threads = []
    start = time.clock()
    search_url = "http://company.yktworld.com/comapny_search.asp?page={page}"
    # enqueue the tasks; in real use you would keep feeding the queue
    for index in xrange(20) :
        SHARE_Q.put(search_url.format(page=index))
    for i in xrange(_WORKER_THREAD_NUM) :
        thread = MyThread(worker, i)
        thread.start()    # the thread starts consuming tasks immediately
        threads.append(thread)
    for thread in threads :
        thread.join()
    SHARE_Q.join()
    i = 0
    with open("down.txt", "w+") as my_file :
        for page in _DATA :
            i += 1
            for name in page:
                my_file.write(name + "\n")

    end = time.clock()
    print u'Scraping finished!'
    print u'Total pages:', i
    print u'Total records:', _Num
    print u'Elapsed time:', end - start, u'seconds'

if __name__ == '__main__':
    main()
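The script above targets Python 2.7, which reached end of life in 2020. For reference, here is a minimal Python 3 sketch of the same worker-threads-plus-queue pattern (fetch_page and the 20-page range mirror the original; the parsing step is elided because it depends on the site's markup):

import queue
import threading
import time
import urllib.request

SHARE_Q = queue.Queue()
_WORKER_THREAD_NUM = 3

def fetch_page(url):
    # fetch one page and decode it; GBK matches the original site encoding
    with urllib.request.urlopen(url) as resp:
        return resp.read().decode("gbk", "ignore")

def worker():
    while True:
        try:
            # non-blocking get avoids the empty()/get() race
            url = SHARE_Q.get(block=False)
        except queue.Empty:
            break
        html = fetch_page(url)
        # ... parse html with BeautifulSoup here, as find_data() does above ...
        time.sleep(1)
        SHARE_Q.task_done()

def main():
    base_url = "http://company.yktworld.com/comapny_search.asp?page={page}"
    for index in range(20):
        SHARE_Q.put(base_url.format(page=index))
    threads = [threading.Thread(target=worker) for _ in range(_WORKER_THREAD_NUM)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

if __name__ == "__main__":
    main()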