#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Python spider: scrapes company information from a smart-card (一卡通) industry site.
Author: yangyongzhen
Version: 0.0.2
Date: 2014-12-14
Language: Python 2.7.5
Editor: Sublime Text 2
"""

import urllib2
import re
import threading
import Queue
import time
import sys
from bs4 import BeautifulSoup
#from pprint import pprint

reload(sys)
sys.setdefaultencoding('utf8')

_DATA = []                  # one list of scraped fields per page
FILE_LOCK = threading.Lock()
SHARE_Q = Queue.Queue()     # unbounded queue of page urls shared by the workers
_WORKER_THREAD_NUM = 3      # number of worker threads
_Num = 0                    # running total of scraped records


class MyThread(threading.Thread):
    def __init__(self, func, num):
        super(MyThread, self).__init__()  # call the parent constructor
        self.func = func                  # the callable this thread runs
        self.thread_num = num

    def run(self):
        self.func()
        #print u'thread id:', self.thread_num


def worker():
    global SHARE_Q
    while True:
        try:
            # get_nowait() closes the race left by the usual empty()/get()
            # pair: another worker can drain the queue between the two calls
            url = SHARE_Q.get_nowait()
        except Queue.Empty:
            break
        my_page = get_page(url)
        find_data(my_page)  # extract the records on this page
        #write_into_file(temp_data)
        time.sleep(1)
        SHARE_Q.task_done()
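
# A hedged aside, not part of the original script: the hand-rolled
# MyThread/worker/Queue combination can be swapped for the stdlib thread pool
# in multiprocessing.dummy. The helper name crawl_with_pool is made up for
# illustration; it fetches and parses a list of urls on _WORKER_THREAD_NUM
# threads using the same get_page()/find_data() defined in this script.
def crawl_with_pool(urls):
    from multiprocessing.dummy import Pool  # thread pool, despite the name
    pool = Pool(_WORKER_THREAD_NUM)
    pool.map(lambda url: find_data(get_page(url)), urls)
    pool.close()
    pool.join()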


def get_page(url):
    """
    Fetch the HTML of the page at the given url.
    Args:
        url: the url of the page to crawl
    Returns:
        the whole page as a unicode string ("" when the fetch fails)
    Raises:
        URLError: raised by urllib2 and handled below
    """
    my_page = ""  # default so the return never hits a NameError after an error
    try:
        html = urllib2.urlopen(url).read()
        my_page = html.decode("gbk", 'ignore')
    except urllib2.URLError, e:
        if hasattr(e, "code"):
            print "The server couldn't fulfill the request."
            print "Error code: %s" % e.code
        elif hasattr(e, "reason"):
            print "We failed to reach a server. Please check your url and read the Reason"
            print "Reason: %s" % e.reason
    return my_page
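
# Another hedged sketch, not in the original: the same fetch via the
# third-party requests library (assumes `pip install requests`; the
# function name and the 10-second timeout are illustrative choices).
def get_page_requests(url):
    import requests
    resp = requests.get(url, timeout=10)
    resp.encoding = "gbk"  # the target site serves gbk, as above
    return resp.text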


def find_data(my_page):
    """
    Match the company records out of the full page HTML.
    Args:
        my_page: the HTML text of one result page
    """
    global _Num
    temp_data = []
    items = BeautifulSoup(my_page, "html.parser").find_all(
        "div",
        style="width:96%;margin:10px;border-bottom:1px #CCC dashed;padding-bottom:10px;")
    for index, item in enumerate(items):
        # company name
        if item.a and item.a.string:
            data = item.a.string.encode("gbk", "ignore")
            print data
            temp_data.append(data)
        # products and contact details
        goods = item.find_all("div", style="font-size:12px;")
        for i in goods:
            data = i.get_text().encode("gbk", "ignore")
            temp_data.append(data)
            print data
        # link address
        pat = re.compile(r'href="([^"]*)"')
        h = pat.search(str(item))
        if h:
            print h.group(1)
            temp_data.append(h.group(1))
        _Num += 1
    _DATA.append(temp_data)
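
# The exact inline-style string matched above is brittle: any whitespace or
# CSS tweak on the site breaks find_data(). One looser alternative (a sketch,
# not tested against the live site; the helper name is made up) keys on a
# distinctive substring via bs4's callable attribute filters.
def find_items_loose(my_page):
    soup = BeautifulSoup(my_page, "html.parser")
    return soup.find_all(
        "div", style=lambda s: s and "border-bottom:1px #CCC dashed" in s)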


def main():
    global SHARE_Q
    threads = []
    start = time.clock()
    douban_url = "http://company.yktworld.com/comapny_search.asp?page={page}"
    # enqueue the page urls; a real crawler would keep feeding the queue
    for index in xrange(20):
        SHARE_Q.put(douban_url.format(page=index))
    for i in xrange(_WORKER_THREAD_NUM):
        thread = MyThread(worker, i)
        thread.start()  # the thread starts pulling urls off the queue
        threads.append(thread)
    for thread in threads:
        thread.join()
    SHARE_Q.join()
    i = 0
    with open("down.txt", "w+") as my_file:
        for page in _DATA:
            i += 1
            for name in page:
                my_file.write(name + "\n")
    print "Spider Successful!!!"
    end = time.clock()
    print u'Scraping finished!'
    print u'Total pages:', i
    print u'Total records:', _Num
    print u'Elapsed:', end - start, u'seconds'


if __name__ == '__main__':
    main()
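
The script as posted is Python 2 only (urllib2, Queue, print statements, reload(sys)). As a hedged sketch of how the fetch step would look on Python 3, under the stdlib renames urllib2 -> urllib.request / urllib.error (the rest of the spider ports the same way, with Queue -> queue and xrange -> range):

    # Python 3 version of get_page(): same gbk decode on the raw bytes
    import urllib.request
    import urllib.error

    def get_page(url):
        try:
            html = urllib.request.urlopen(url).read()
            return html.decode("gbk", "ignore")
        except urllib.error.URLError as e:
            print("Failed to fetch %s: %s" % (url, e))
            return ""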