- 在线时间
- 480 小时
- 最后登录
- 2026-6-1
- 注册时间
- 2023-7-11
- 听众数
- 4
- 收听数
- 0
- 能力
- 0 分
- 体力
- 7823 点
- 威望
- 0 点
- 阅读权限
- 255
- 积分
- 2934
- 相册
- 0
- 日志
- 0
- 记录
- 0
- 帖子
- 1174
- 主题
- 1189
- 精华
- 0
- 分享
- 0
- 好友
- 1
该用户从未签到
 |
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Python crawler that scrapes company listings for the "一卡通" (one-card) industry.
Author: yangyongzhen
Version: 0.0.2
Date: 2014-12-14
Language: Python2.7.5
Editor: Sublime Text2
"""

import os
import Queue
import re
import string
import sys
import threading
import time
import urllib2

from bs4 import BeautifulSoup
#from pprint import pprint

# Python 2 only: re-expose sys.setdefaultencoding (removed from the module by
# site.py) and make implicit str <-> unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

_DATA = []  # one list of extracted fields per scraped page
FILE_LOCK = threading.Lock()
SHARE_Q = Queue.Queue()  # unbounded queue of page URLs to fetch
_WORKER_THREAD_NUM = 3  # number of worker threads

_Num = 0  # total number of records extracted
class MyThread(threading.Thread):
    """Thread that runs a caller-supplied function.

    Args:
        func: Zero-argument callable executed by ``run``.
        num: Identifier stored on the thread as ``thread_num``.
    """

    def __init__(self, func, num):
        super(MyThread, self).__init__()  # initialise the Thread base class
        self.func = func
        self.thread_num = num

    def run(self):
        """Call the stored function once."""
        self.func()
def worker():
    """Drain SHARE_Q, fetching each queued URL and extracting its records.

    Runs until the queue is empty. Each URL is downloaded with ``get_page``
    and parsed with ``find_data``; the thread then sleeps one second before
    taking the next URL.
    """
    while True:
        # get_nowait() instead of empty()+get(): with several workers another
        # thread can take the last item between the two calls, leaving this
        # thread blocked forever in get().
        try:
            url = SHARE_Q.get_nowait()
        except Queue.Empty:
            break
        try:
            find_data(get_page(url))
            time.sleep(1)
        finally:
            # Always mark the task finished, even if processing raised, so
            # that SHARE_Q.join() cannot wait on a task that will never end.
            SHARE_Q.task_done()
def get_page(url):
    """Download ``url`` and return its body decoded from GBK.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page HTML as a unicode string, decoded as GBK with undecodable
        bytes ignored. If the request fails with ``urllib2.URLError`` the
        error is printed and an empty unicode string is returned.
    """
    # Bound up front so the return below is valid when the request fails.
    my_page = u""
    try:
        response = urllib2.urlopen(url)
        try:
            html = response.read()
        finally:
            response.close()
        my_page = html.decode("gbk", 'ignore')
    except urllib2.URLError as e:
        if hasattr(e, "code"):
            print("The server couldn't fulfill the request.")
            print("Error code: %s" % e.code)
        elif hasattr(e, "reason"):
            print("We failed to reach a server. Please check your url and read the Reason")
            print("Reason: %s" % e.reason)
    return my_page

def find_data(my_page):
    """Extract company records from one result page and store them.

    Every ``div`` carrying the listing style is treated as one record. For
    each record the text of its first ``<a>``, the text of each
    ``font-size:12px`` ``div`` inside it, and the first ``href`` value found
    in its markup are appended (in that order) to a per-page list. The list
    is appended to ``_DATA`` and ``_Num`` grows by the number of records.

    Args:
        my_page: HTML text of the page. A falsy value (e.g. a failed
            download) is ignored and nothing is recorded.
    """
    global _Num
    if not my_page:
        return

    href_pattern = re.compile(r'href="([^"]*)"')  # compiled once per page
    temp_data = []
    items = BeautifulSoup(my_page).find_all("div", style="width:96%;margin:10px;border-bottom:1px #CCC dashed;padding-bottom:10px;")
    for item in items:
        # Company name. get_text() rather than .string, which is None when
        # the anchor has more than one child node.
        if item.a:
            data = item.a.get_text().encode("gbk", "ignore")
            print(data)
            temp_data.append(data)

        # Products and contact details.
        for detail in item.find_all("div", style="font-size:12px;"):
            data = detail.get_text().encode("gbk", "ignore")
            temp_data.append(data)
            print(data)

        # Link address: first href attribute in the record's markup.
        match = href_pattern.search(str(item))
        if match:
            href = match.group(1)
            print(href)
            temp_data.append(href)

    # Several worker threads call this function; update the shared counter
    # and result list under the module's lock.
    with FILE_LOCK:
        _Num += len(items)
        _DATA.append(temp_data)
#headers = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}  ## Browser request header (most sites return an error without it, so be sure to include it)
#all_url = 'http://www.mzitu.com/all'  ## Starting URL
#start_html = requests.get(all_url, headers=headers)  ## Fetch all_url with requests.get; headers is the request header set above (see the official requests documentation)
#print(start_html.text)  ## Print start_html (note: content is binary data, generally used when downloading images, video, audio and other media; use text to print page content)

def main():
    """Scrape 20 search-result pages with worker threads and save the data.

    Queues the page URLs on ``SHARE_Q``, starts ``_WORKER_THREAD_NUM``
    ``MyThread`` workers running ``worker``, waits for them, writes every
    collected field to ``down.txt`` (one per line) and prints a summary.
    """
    threads = []
    # Wall-clock time: time.clock() reports CPU time on Unix, which is near
    # zero for this I/O-bound job, and no longer exists in Python 3.8+.
    start = time.time()
    search_url = "http://company.yktworld.com/comapny_search.asp?page={page}"
    # Enqueue the tasks; in real use tasks would be added continuously.
    for index in range(20):
        SHARE_Q.put(search_url.format(page=index))
    for i in range(_WORKER_THREAD_NUM):
        thread = MyThread(worker, i)
        thread.start()
        threads.append(thread)
    # Joining the threads is sufficient; a further SHARE_Q.join() would block
    # forever if any worker exited without calling task_done().
    for thread in threads:
        thread.join()

    page_count = 0
    with open("down.txt", "w") as my_file:
        for page in _DATA:
            page_count += 1
            for name in page:
                my_file.write(name + "\n")

    print("Spider Successful!!!")
    end = time.time()
    # Single formatted argument per print so the output is the same whether
    # print is a statement (Python 2) or a function.
    print(u'抓取完成!')
    print(u'总页数: %s' % page_count)
    print(u'总条数: %s' % _Num)
    print(u'一共用时: %s 秒' % (end - start))

# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
( |/ L1 X, N; G) G V
% r/ {2 M* o5 Z5 [7 c% N8 ] |
zan
|