import hashlib
from datetime import datetime

import scrapy
from scrapy.http import TextResponse

from scrapy_lianjia_ershoufang.items import ScrapyLianjiaErshoufangItem

class ErshoufangSpider(scrapy.Spider):
    """Spider that scrapes second-hand housing listings from lianjia.com.

    It requests the paginated ``/ershoufang/pgN/`` listing pages of one city
    sub-domain and yields one ``ScrapyLianjiaErshoufangItem`` per listing
    entry found in ``ul.sellListContent``.

    Spider arguments (``-a key=value``):
        city: sub-domain of lianjia.com to crawl; defaults to ``default_city``.
        max_pages: number of listing pages to request; defaults to 100.
    """

    name = 'ErshoufangSpider'

    # City sub-domain used when no ``city`` spider argument is supplied.
    default_city = 'sz'
    # Listing pages pg1 .. pg<max_pages> are requested.
    max_pages = 100

    # Patterns applied to the text of a listing entry. Each one must yield
    # exactly as many groups as listed in the comment, otherwise the entry
    # is treated as malformed.
    # room, area, orientation, decoration, elevator
    _HOUSE_INFO_RE = r'\|\s+(.*)\s+\|\s+(.*)平米\s+\|\s+(.*)\s+\|\s+(.*)\s+\|\s+(.*)'
    # follow_number, look_number, pub_duration
    _FOLLOW_INFO_RE = r'(.*)人关注\s+/\s+共(.*)次带看\s+/\s+(.*)发布'
    # unit_price
    _UNIT_PRICE_RE = r'单价(.*)元/平米'

    def __init__(self, name=None, **kwargs):
        """Initialise the spider and restrict it to the chosen city's domain.

        Args:
            name: optional spider name override, forwarded to the base class.
            **kwargs: spider arguments; the base class stores them as
                instance attributes (this is where ``city`` comes from).
        """
        # Forward the caller's ``name`` instead of always passing None.
        super().__init__(name=name, **kwargs)
        if getattr(self, 'city', None) is None:
            self.city = self.default_city
        self.allowed_domains = ['%s.lianjia.com' % self.city]

    def start_requests(self):
        """Yield one request per listing page, using the page URL as Referer."""
        # Spider arguments arrive as strings, hence the int() conversion.
        last_page = int(getattr(self, 'max_pages', 100))
        for page in range(1, last_page + 1):
            url = 'https://%s.lianjia.com/ershoufang/pg%d/' % (self.city, page)
            yield scrapy.Request(url, self.parse, headers={'Referer': url})

    def parse(self, response: TextResponse):
        """Yield an item for every well-formed listing entry on the page.

        Entries that do not match the expected layout are skipped so that a
        single odd ``<li>`` does not abort the rest of the page.
        """
        for li in response.css('ul.sellListContent li'):
            item = self._parse_listing(li)
            if item is None:
                self.logger.debug('Skipping unparseable entry on %s', response.url)
                continue
            yield item

    def _parse_listing(self, li):
        """Build an item from one ``<li>`` selector, or return None if malformed."""
        title = li.css('div.title a::text').get()
        house_infos = li.css('div.address .houseInfo::text').re(self._HOUSE_INFO_RE)
        follow_infos = li.css('div.followInfo::text').re(self._FOLLOW_INFO_RE)
        unit_price = li.css('div.priceInfo .unitPrice span::text').re(self._UNIT_PRICE_RE)

        # Any missing piece means this <li> is not a regular listing entry.
        if (title is None or len(house_infos) < 5
                or len(follow_infos) < 3 or not unit_price):
            return None

        item = ScrapyLianjiaErshoufangItem()
        item['title'] = title.replace(':', '').replace(',', ' ').replace("\n", '')

        (item['room'], item['area'], item['orientation'],
         item['decoration'], item['elevator']) = house_infos[:5]

        item['xiaoqu'] = li.css('div.address a::text').get()
        # The position text may be absent; fall back to an empty string.
        position_text = li.css('div.flood .positionInfo::text').get() or ''
        item['flood'] = position_text.replace('-', '').strip()
        item['location'] = li.css('div.flood .positionInfo a::text').get()

        (item['follow_number'], item['look_number'],
         item['pub_duration']) = follow_infos[:3]

        item['total_price'] = li.css('div.priceInfo div.totalPrice span::text').get()
        item['unit_price'] = unit_price[0]
        item['total_unit'] = li.css('div.totalPrice::text').get()
        item['crawl_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        # The id is the MD5 of these fields concatenated in this exact order
        # with no separator; changing either would change every house_id.
        id_fields = ('title', 'room', 'area', 'orientation', 'elevator',
                     'xiaoqu', 'flood', 'location')
        item['house_id'] = self.genearteMD5(
            ''.join(str(item[field]) for field in id_fields))
        return item

    def genearteMD5(self, text):
        """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8.

        The misspelled method name is kept so existing callers keep working.
        """
        # Create the md5 object and feed it the encoded text.
        digest = hashlib.md5()
        digest.update(text.encode(encoding='utf-8'))
        return digest.hexdigest()
- g+ w2 I6 q; _17 K2 y" ^2 H) q3 c
2$ f) m# k5 X5 m4 n5 `
3 ` T# H' q8 a* C
4; S+ R/ {1 t# Y: c7 `" C
5 ' x4 @2 D# L( R: P* u: P67 Z/ f- Q* g5 `+ F% S
7 & V5 |9 e% T( u, R8 j. B. ~; t84 _. c4 D* b, x, G
96 Y+ Y! y! s2 z
10 $ B- I% b3 @" X4 x11; U- w5 k) z2 Z. L. `, U6 p: {* k+ S
124 P2 s3 c6 [1 w5 F5 T2 n
135 Y( [ }+ C# q! t1 n
140 I4 T. Z. Z' u1 h. Y3 E
15 $ S0 |, g; e, i, Y16 , J/ T' s T$ J! ?178 F4 L1 s4 m6 H7 R3 N
18- L$ y' { K! z( f$ E6 k8 P
196 n6 S( [2 l& C' A
20, m! E3 S6 U- O: z6 c- v
21- A1 M- C% n' q2 C' N9 i% x9 m* E: B
22& b5 u# D$ n0 U& I
23 $ w+ q; R' j& b) D6 K8 z& W/ Q24 2 K: Z9 W7 C# r \& }8 L3 _: \25: l7 J9 L8 S/ r/ w5 {. a) |
26 * U* t' C; N) A1 z27- P+ S2 T- c5 @4 ]# z
28' x$ E) q& b0 \! |# W
29' ]: ]' A4 E" h6 E9 K
30; H! J* W- t: L+ ~
31 : U$ J* y; u+ ?/ @# v- e2 n32) F& C) U( i9 E+ r' Z# x% n: T8 c/ m5 t
33 5 t* Z3 R0 a0 N! ~" Z34 " {' J4 ]+ v( n6 Z35 & I7 L; j& g. `- k* X36$ A2 r4 \5 l3 i4 w2 f
377 S. v0 O4 }1 j4 ^7 W6 L$ J
38; D4 O k4 [+ P% A. n: r
39+ A7 |' W- [( \4 c. A0 v
40 2 R1 _, S9 t* q' S4 f5 O41 ' X! x3 C1 ?# Y+ R1 m- B42 5 R& i) f% h8 H U, m3 o+ o/ c$ G43 + D8 G( f9 M( P2 W44+ Z; b Q& @. u- E1 B# P5 C
45: x9 K; Q/ C. D$ \% f2 ^- X
46 & z9 m! M' c# T! U47& m! j2 ?' ~; M7 F. { |0 j
48 . B f- L7 L: \2 z495 j& @4 S2 X8 D5 o
50 ( d0 K* }' M# H2 w# J" r51& c9 V' ]/ W/ k# P* R, M9 }9 x; \
52 + f% P: f: \# T' M* b. p7 x2 A% o53 9 O- h! }$ H1 K B$ Q/ Q54 b: o. }7 p! i1 H+ T
557 u8 n& L9 B3 ~
56 ( S5 }& V5 B! A$ E57 * m( ^" h0 X$ C588 V4 {' z( W. t, A, R( Y/ I8 P
59( j+ O# n0 F( O* F/ _
60 , j' W J% x6 {( h+ d9 ?1 h3 B) C( Q1 P6 u8 ^% O! R5 V- v" P
% p. Q* D8 y; }0 {2 S$ n
/ r6 X3 D; U0 y( C* Q1 } 3 j4 [* z( j8 x+ a' s. T. n. [0 h9 B" h+ R9 G( V! V' n: x* Z
7 y6 x4 T+ ?# r2 k2 q1 P- P
6 ^0 G1 K$ ~& t8 E9 y) }* S
) Z- f7 g; x9 f$ E: H+ l/ W